azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- azure/ai/evaluation/__init__.py +82 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/_experimental.py +172 -0
- azure/ai/evaluation/_common/constants.py +72 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/rai_service.py +632 -0
- azure/ai/evaluation/_common/utils.py +445 -0
- azure/ai/evaluation/_constants.py +72 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
- azure/ai/evaluation/_evaluate/_utils.py +298 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
- azure/ai/evaluation/_exceptions.py +128 -0
- azure/ai/evaluation/_http_utils.py +466 -0
- azure/ai/evaluation/_model_configurations.py +123 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +16 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
- azure/ai/evaluation/simulator/_constants.py +27 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
- azure/ai/evaluation/simulator/_simulator.py +716 -0
- azure/ai/evaluation/simulator/_tracing.py +89 -0
- azure/ai/evaluation/simulator/_utils.py +132 -0
- azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

@@ -0,0 +1,179 @@
```python
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import functools
import inspect
import json
import logging
from typing import Callable, Dict, Literal, Optional, Union, cast

import pandas as pd
from promptflow._sdk.entities._flows import FlexFlow as flex_flow
from promptflow._sdk.entities._flows import Prompty as prompty_sdk
from promptflow._sdk.entities._flows.dag import Flow as dag_flow
from promptflow.client import PFClient
from promptflow.core import Prompty as prompty_core
from typing_extensions import ParamSpec

from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult

from ..._user_agent import USER_AGENT
from .._utils import _trace_destination_from_project_scope

LOGGER = logging.getLogger(__name__)

P = ParamSpec("P")


def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
    """
    Get evaluator type for telemetry.

    :param evaluator: The evaluator object
    :type evaluator: Dict[str, Callable]
    :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
    :rtype: Literal["content-safety", "built-in", "custom"]
    """
    module = inspect.getmodule(evaluator)
    module_name = module.__name__ if module else ""

    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")

    if content_safety:
        return "content-safety"
    if built_in:
        return "built-in"
    return "custom"


def _get_evaluator_properties(evaluator, evaluator_name):
    """
    Get evaluator properties for telemetry.

    :param: evaluator: The evaluator object
    :param: evaluator_name: The alias for the evaluator
    :type: str
    :raises Exception: If the evaluator properties cannot be retrieved
    :return: A dictionary containing the evaluator properties, including
        "name": A name for the evaluator
        "pf_type": The promptflow type being used
        "type": The evaluator type. Accepted values are "built-in", "custom", and "content-safety"
        "alias": The alias for the evaluator. Defaults to an empty string.
    :rtype: Dict[str, str]
    """

    try:
        # Cover flex flow and prompty based evaluator
        if isinstance(evaluator, (prompty_sdk, prompty_core, flex_flow)):
            name = evaluator.name
            pf_type = evaluator.__class__.__name__
        # Cover dag flow based evaluator
        elif isinstance(evaluator, dag_flow):
            name = evaluator.name
            pf_type = "DagFlow"
        elif inspect.isfunction(evaluator):
            name = evaluator.__name__
            pf_type = flex_flow.__name__
        elif hasattr(evaluator, "__class__") and callable(evaluator):
            name = evaluator.__class__.__name__
            pf_type = flex_flow.__name__
        else:
            # fallback option
            name = str(evaluator)
            pf_type = "Unknown"
    except Exception as e:  # pylint: disable=broad-exception-caught
        LOGGER.debug("Failed to get evaluator properties: %s", e)
        name = str(evaluator)
        pf_type = "Unknown"

    return {
        "name": name,
        "pf_type": pf_type,
        "type": _get_evaluator_type(evaluator),
        "alias": evaluator_name if evaluator_name else "",
    }


# cspell:ignore isna
def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
    """Decorator to log evaluate activity

    :param func: The function to be decorated
    :type func: Callable
    :returns: The decorated function
    :rtype: Callable[P, EvaluationResult]
    """

    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
        from promptflow._sdk._telemetry import ActivityType, log_activity
        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))

        pf_client = PFClient(
            config=(
                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
                if azure_ai_project
                else None
            ),
            user_agent=USER_AGENT,
        )

        trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
        track_in_cloud = bool(trace_destination) if trace_destination != "none" else False
        evaluate_target = bool(kwargs.get("target", None))
        evaluator_config = bool(kwargs.get("evaluator_config", None))
        custom_dimensions: Dict[str, Union[str, bool]] = {
            "track_in_cloud": track_in_cloud,
            "evaluate_target": evaluate_target,
            "evaluator_config": evaluator_config,
        }

        with log_activity(
            get_telemetry_logger(),
            "pf.evals.evaluate",
            activity_type=ActivityType.PUBLICAPI,
            user_agent=USER_AGENT,
            custom_dimensions=custom_dimensions,
        ):
            result = func(*args, **kwargs)

            try:
                evaluators_info = []
                for evaluator_name, evaluator in evaluators.items():
                    evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
                    try:
                        evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
                            like=f"outputs.{evaluator_name}", axis=1
                        )

                        failed_rows = (
                            evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
                        )
                        total_rows = evaluator_df.shape[0]

                        evaluator_info["failed_rows"] = failed_rows
                        evaluator_info["total_rows"] = total_rows
                    except Exception as e:  # pylint: disable=broad-exception-caught
                        LOGGER.debug("Failed to collect evaluate failed row info for %s: %s", evaluator_name, e)
                    evaluators_info.append(evaluator_info)

                custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
                with log_activity(
                    get_telemetry_logger(),
                    "pf.evals.evaluate_usage_info",
                    activity_type=ActivityType.PUBLICAPI,
                    user_agent=USER_AGENT,
                    custom_dimensions=custom_dimensions,
                ):
                    pass
            except Exception as e:  # pylint: disable=broad-exception-caught
                LOGGER.debug("Failed to collect evaluate usage info: %s", e)

            return result

    return wrapper
```
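The wrapper above derives per-evaluator usage counts from the evaluation result: it keeps only the result columns whose names start with `outputs.<alias>` and treats a row as failed when any of those columns is NaN. A minimal pandas sketch of just that counting logic (the alias `my_eval` and the sample rows are made up for illustration):

```python
import pandas as pd

# Hypothetical evaluate() result rows for an evaluator registered under the alias "my_eval".
rows = [
    {"outputs.my_eval.score": 4.0, "outputs.other.score": 1.0},
    {"outputs.my_eval.score": None, "outputs.other.score": 2.0},
]

# Keep only this evaluator's output columns, as the decorator does.
evaluator_df = pd.DataFrame(rows).filter(like="outputs.my_eval", axis=1)

# A row counts as failed when any of the evaluator's output columns is NaN.
failed_rows = evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
total_rows = evaluator_df.shape[0]

print(failed_rows, total_rows)  # 1 2
```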
azure/ai/evaluation/_evaluate/_utils.py

@@ -0,0 +1,298 @@
```python
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import json
import logging
import os
import re
import tempfile
from pathlib import Path
from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
import uuid
import base64

import pandas as pd
from promptflow.client import PFClient
from promptflow.entities import Run

from azure.ai.evaluation._constants import (
    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
    DefaultOpenEncoding,
    EvaluationRunProperties,
    Prefixes,
)
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation._model_configurations import AzureAIProject

LOGGER = logging.getLogger(__name__)

AZURE_WORKSPACE_REGEX_FORMAT = (
    "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)"
    "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
)


class AzureMLWorkspace(NamedTuple):
    subscription_id: str
    resource_group_name: str
    workspace_name: str


def is_none(value) -> bool:
    return value is None or str(value).lower() == "none"


def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
    trace_provider: str,
) -> AzureMLWorkspace:
    match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
    if not match or len(match.groups()) != 5:
        raise EvaluationException(
            message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
            f"workspaces/<workspace_name>, got {trace_provider}",
            internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
            "workspaces/<workspace_name>,",
            target=ErrorTarget.UNKNOWN,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.UNKNOWN,
        )
    subscription_id = match.group(1)
    resource_group_name = match.group(3)
    workspace_name = match.group(5)
    return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)


def load_jsonl(path):
    with open(path, "r", encoding=DefaultOpenEncoding.READ) as f:
        return [json.loads(line) for line in f.readlines()]


def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
    from promptflow.azure._cli._utils import _get_azure_pf_client

    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
    azure_pf_client = _get_azure_pf_client(
        subscription_id=ws_triad.subscription_id,
        resource_group=ws_triad.resource_group_name,
        workspace_name=ws_triad.workspace_name,
    )

    return azure_pf_client, ws_triad


def _store_multimodal_content(messages, tmpdir: str):
    # verify if images folder exists
    images_folder_path = os.path.join(tmpdir, "images")
    os.makedirs(images_folder_path, exist_ok=True)

    # traverse all messages and replace base64 image data with new file name.
    for message in messages:
        if isinstance(message.get("content", []), list):
            for content in message.get("content", []):
                if content.get("type") == "image_url":
                    image_url = content.get("image_url")
                    if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"):
                        # Extract the base64 string
                        base64image = image_url["url"].replace("data:image/jpg;base64,", "")

                        # Generate a unique filename
                        image_file_name = f"{str(uuid.uuid4())}.jpg"
                        image_url["url"] = f"images/{image_file_name}"  # Replace the base64 URL with the file path

                        # Decode the base64 string to binary image data
                        image_data_binary = base64.b64decode(base64image)

                        # Write the binary image data to the file
                        image_file_path = os.path.join(images_folder_path, image_file_name)
                        with open(image_file_path, "wb") as f:
                            f.write(image_data_binary)


def _log_metrics_and_instance_results(
    metrics: Dict[str, Any],
    instance_results: pd.DataFrame,
    trace_destination: Optional[str],
    run: Run,
    evaluation_name: Optional[str],
) -> Optional[str]:
    from azure.ai.evaluation._evaluate._eval_run import EvalRun

    if trace_destination is None:
        LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
        return None

    azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
    tracking_uri = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name).mlflow_tracking_uri

    # Adding line_number as index column this is needed by UI to form link to individual instance run
    instance_results["line_number"] = instance_results.index.values

    with EvalRun(
        run_name=run.name if run is not None else evaluation_name,
        tracking_uri=tracking_uri,
        subscription_id=ws_triad.subscription_id,
        group_name=ws_triad.resource_group_name,
        workspace_name=ws_triad.workspace_name,
        ml_client=azure_pf_client.ml_client,
        promptflow_run=run,
    ) as ev_run:
        artifact_name = EvalRun.EVALUATION_ARTIFACT

        with tempfile.TemporaryDirectory() as tmpdir:
            # storing multi_modal images if exists
            col_name = "inputs.conversation"
            if col_name in instance_results.columns:
                for item in instance_results[col_name].items():
                    value = item[1]
                    if "messages" in value:
                        _store_multimodal_content(value["messages"], tmpdir)

            # storing artifact result
            tmp_path = os.path.join(tmpdir, artifact_name)

            with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f:
                f.write(instance_results.to_json(orient="records", lines=True))

            ev_run.log_artifact(tmpdir, artifact_name)

            # Using mlflow to create a dummy run since once created via PF show traces of dummy run in UI.
            # Those traces can be confusing.
            # adding these properties to avoid showing traces if a dummy run is created.
            # We are doing that only for the pure evaluation runs.
            if run is None:
                ev_run.write_properties_to_run_history(
                    properties={
                        EvaluationRunProperties.RUN_TYPE: "eval_run",
                        EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
                        "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                    }
                )

        for metric_name, metric_value in metrics.items():
            ev_run.log_metric(metric_name, metric_value)

    evaluation_id = ev_run.info.run_name if run is not None else ev_run.info.run_id
    return _get_ai_studio_url(trace_destination=trace_destination, evaluation_id=evaluation_id)


def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
    studio_base_url = os.getenv("AI_STUDIO_BASE_URL", "https://ai.azure.com")

    studio_url = (
        f"{studio_base_url}/build/evaluation/{evaluation_id}?wsid=/subscriptions/{ws_triad.subscription_id}"
        f"/resourceGroups/{ws_triad.resource_group_name}/providers/Microsoft.MachineLearningServices/"
        f"workspaces/{ws_triad.workspace_name}"
    )

    return studio_url


def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
    subscription_id = project_scope["subscription_id"]
    resource_group_name = project_scope["resource_group_name"]
    workspace_name = project_scope["project_name"]

    trace_destination = (
        f"azureml://subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/"
        f"providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}"
    )

    return trace_destination


def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
    p = Path(path)
    if p.is_dir():
        p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME

    with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
        json.dump(data_dict, f)

    print(f'Evaluation results saved to "{p.resolve()}".\n')


def _apply_column_mapping(
    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
) -> pd.DataFrame:
    """
    Apply column mapping to source_df based on mapping_config.

    This function is used for pre-validation of input data for evaluators
    :param source_df: the data frame to be changed.
    :type source_df: pd.DataFrame
    :param mapping_config: The configuration, containing column mapping.
    :type mapping_config: Dict[str, str].
    :param inplace: If true, the source_df will be changed inplace.
    :type inplace: bool
    :return: The modified data frame.
    :rtype: pd.DataFrame
    """
    result_df = source_df

    if mapping_config:
        column_mapping = {}
        columns_to_drop = set()
        pattern_prefix = "data."
        run_outputs_prefix = "run.outputs."

        for map_to_key, map_value in mapping_config.items():
            match = re.search(r"^\${([^{}]+)}$", map_value)
            if match is not None:
                pattern = match.group(1)
                if pattern.startswith(pattern_prefix):
                    map_from_key = pattern[len(pattern_prefix) :]
                elif pattern.startswith(run_outputs_prefix):
                    # Target-generated columns always starts from .outputs.
                    map_from_key = f"{Prefixes.TSG_OUTPUTS}{pattern[len(run_outputs_prefix) :]}"
                # if we are not renaming anything, skip.
                if map_from_key == map_to_key:
                    continue
                # If column needs to be mapped to already existing column, we will add it
                # to the drop list.
                if map_to_key in source_df.columns:
                    columns_to_drop.add(map_to_key)
                column_mapping[map_from_key] = map_to_key
        # If we map column to another one, which is already present in the data
        # set and the letter also needs to be mapped, we will not drop it, but map
        # instead.
        columns_to_drop = columns_to_drop - set(column_mapping.keys())
        result_df = source_df.drop(columns=columns_to_drop, inplace=inplace)
        result_df.rename(columns=column_mapping, inplace=True)

    return result_df


def _has_aggregator(evaluator: object) -> bool:
    return hasattr(evaluator, "__aggregate__")


def get_int_env_var(env_var_name: str, default_value: int) -> int:
    """
    The function `get_int_env_var` retrieves an integer environment variable value, with a
    default value if the variable is not set or cannot be converted to an integer.

    :param env_var_name: The name of the environment variable you want to retrieve the value of
    :type env_var_name: str
    :param default_value: The default value is the value that will be returned if the environment
        variable is not found or if it cannot be converted to an integer
    :type default_value: int
    :return: an integer value.
    :rtype: int
    """
    try:
        return int(os.environ[env_var_name])
    except (ValueError, KeyError):
        return default_value


def set_event_loop_policy() -> None:
    import asyncio
    import platform

    if platform.system().lower() == "windows":
        # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
        # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
```
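`_trace_destination_from_project_scope` and `extract_workspace_triad_from_trace_provider` are effectively inverses: one builds an `azureml://` trace destination from a project scope, the other parses the subscription, resource group, and workspace back out with `AZURE_WORKSPACE_REGEX_FORMAT`. A small standalone sketch of that round trip, using placeholder IDs and names:

```python
import re

# Same pattern as AZURE_WORKSPACE_REGEX_FORMAT above.
AZURE_WORKSPACE_REGEX_FORMAT = (
    "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)"
    "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
)

# Placeholder subscription, resource group, and project names.
project_scope = {
    "subscription_id": "00000000-0000-0000-0000-000000000000",
    "resource_group_name": "my-resource-group",
    "project_name": "my-project",
}

# Build the trace destination the same way _trace_destination_from_project_scope does.
trace_destination = (
    f"azureml://subscriptions/{project_scope['subscription_id']}"
    f"/resourceGroups/{project_scope['resource_group_name']}"
    f"/providers/Microsoft.MachineLearningServices/workspaces/{project_scope['project_name']}"
)

# Parse the workspace triad back out, as extract_workspace_triad_from_trace_provider does.
match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_destination)
assert match and len(match.groups()) == 5
print(match.group(1), match.group(3), match.group(5))
# 00000000-0000-0000-0000-000000000000 my-resource-group my-project
```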
azure/ai/evaluation/_evaluators/_bleu/__init__.py

@@ -0,0 +1,9 @@
```python
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

from ._bleu import BleuScoreEvaluator

__all__ = [
    "BleuScoreEvaluator",
]
```
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -0,0 +1,72 @@
```python
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from promptflow._utils.async_utils import async_run_allowing_running_loop

from azure.ai.evaluation._common.utils import nltk_tokenize


class _AsyncBleuScoreEvaluator:
    def __init__(self):
        pass

    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
        reference_tokens = nltk_tokenize(ground_truth)
        hypothesis_tokens = nltk_tokenize(response)

        # NIST Smoothing
        smoothing_function = SmoothingFunction().method4
        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)

        return {
            "bleu_score": score,
        }


class BleuScoreEvaluator:
    """
    Calculate the BLEU score for a given response and ground truth.

    BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
    translation. It is widely used in text summarization and text generation use cases.

    Use the BLEU score when you want to evaluate the similarity between the generated text and reference text,
    especially in tasks such as machine translation or text summarization, where n-gram overlap is a significant
    indicator of quality.

    The BLEU score ranges from 0 to 1, with higher scores indicating better quality.

    .. admonition:: Example:

        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
            :start-after: [START bleu_score_evaluator]
            :end-before: [END bleu_score_evaluator]
            :language: python
            :dedent: 8
            :caption: Initialize and call an BleuScoreEvaluator.
    """

    id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    def __init__(self):
        self._async_evaluator = _AsyncBleuScoreEvaluator()

    def __call__(self, *, response: str, ground_truth: str, **kwargs):
        """
        Evaluate the BLEU score between the response and the ground truth.

        :keyword response: The response to be evaluated.
        :paramtype response: str
        :keyword ground_truth: The ground truth to be compared against.
        :paramtype ground_truth: str
        :return: The BLEU score.
        :rtype: Dict[str, float]
        """
        return async_run_allowing_running_loop(
            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
        )

    def _to_async(self):
        return self._async_evaluator
```
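As a usage sketch matching the `__call__` signature above (it assumes azure-ai-evaluation 1.0.0 and its nltk dependency are installed; the example sentences are made up):

```python
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()

# Compares n-gram overlap between the generated response and the reference text.
result = bleu(
    response="Tokyo is the capital city of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
print(result)  # a dict with a "bleu_score" float between 0 and 1
```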
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -0,0 +1,107 @@
```python
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
from typing import Dict, Union, List

from typing_extensions import overload, override

from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
from azure.ai.evaluation._model_configurations import Conversation


class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """
    Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.

    The coherence measure assesses the ability of the language model to generate text that reads naturally,
    flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
    and user-friendliness of a model's generated responses in real-world applications.

    :param model_config: Configuration for the Azure OpenAI model.
    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]

    .. admonition:: Example:

        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
            :start-after: [START coherence_evaluator]
            :end-before: [END coherence_evaluator]
            :language: python
            :dedent: 8
            :caption: Initialize and call a CoherenceEvaluator with a query and response.

    .. note::

        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
    """

    _PROMPTY_FILE = "coherence.prompty"
    _RESULT_KEY = "coherence"

    id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
    def __init__(self, model_config):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)

    @overload
    def __call__(
        self,
        *,
        query: str,
        response: str,
    ) -> Dict[str, Union[str, float]]:
        """Evaluate coherence for given input of query, response

        :keyword query: The query to be evaluated.
        :paramtype query: str
        :keyword response: The response to be evaluated.
        :paramtype response: str
        :return: The coherence score.
        :rtype: Dict[str, float]
        """

    @overload
    def __call__(
        self,
        *,
        conversation: Conversation,
    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
        """Evaluate coherence for a conversation

        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
            key "messages", and potentially a global context under the key "context". Conversation turns are expected
            to be dictionaries with keys "content", "role", and possibly "context".
        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
        :return: The coherence score.
        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
        """

    @override
    def __call__(  # pylint: disable=docstring-missing-param
        self,
        *args,
        **kwargs,
    ):
        """Evaluate coherence. Accepts either a query and response for a single evaluation,
        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
        turns, the evaluator will aggregate the results of each turn.

        :keyword query: The query to be evaluated.
        :paramtype query: str
        :keyword response: The response to be evaluated.
        :paramtype response: Optional[str]
        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
            key "messages". Conversation turns are expected
            to be dictionaries with keys "content" and "role".
        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
        :return: The relevance score.
        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
        """
        return super().__call__(*args, **kwargs)
```
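A usage sketch consistent with the query/response overload above. The model configuration is an assumed `AzureOpenAIModelConfiguration`-style dict whose field names and environment variables are placeholders; calling the evaluator performs a real prompty-based model call against your deployment:

```python
import os

from azure.ai.evaluation import CoherenceEvaluator

# Assumed AzureOpenAIModelConfiguration fields; substitute your own deployment details.
model_config = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_key": os.environ["AZURE_OPENAI_API_KEY"],
    "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
}

coherence = CoherenceEvaluator(model_config=model_config)

# Single-turn evaluation via the query/response overload.
result = coherence(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)
# Per the class docstring, the output includes both the new "coherence" key
# and the legacy "gpt_"-prefixed key.
print(result)
```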