azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic by the registry diff service.
- azure/ai/evaluation/__init__.py +60 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/constants.py +65 -0
- azure/ai/evaluation/_common/rai_service.py +452 -0
- azure/ai/evaluation/_common/utils.py +87 -0
- azure/ai/evaluation/_constants.py +50 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
- azure/ai/evaluation/_evaluate/_utils.py +237 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
- azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
- azure/ai/evaluation/_exceptions.py +107 -0
- azure/ai/evaluation/_http_utils.py +395 -0
- azure/ai/evaluation/_model_configurations.py +27 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +15 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
- azure/ai/evaluation/simulator/_constants.py +17 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
- azure/ai/evaluation/simulator/_tracing.py +92 -0
- azure/ai/evaluation/simulator/_utils.py +111 -0
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_evaluate/_telemetry/__init__.py
@@ -0,0 +1,174 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+import functools
+import inspect
+import json
+import logging
+from typing import Callable, Dict
+
+import pandas as pd
+
+from promptflow._sdk.entities._flows import FlexFlow as flex_flow
+from promptflow._sdk.entities._flows import Prompty as prompty_sdk
+from promptflow._sdk.entities._flows.dag import Flow as dag_flow
+from promptflow.client import PFClient
+from promptflow.core import Prompty as prompty_core
+
+from ..._user_agent import USER_AGENT
+from .._utils import _trace_destination_from_project_scope
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _get_evaluator_type(evaluator: Dict[str, Callable]):
+    """
+    Get evaluator type for telemetry.
+
+    :param evaluator: The evaluator object
+    :type evaluator: Dict[str, Callable]
+    :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
+    :rtype: str
+    """
+    built_in = False
+    content_safety = False
+
+    module = inspect.getmodule(evaluator)
+    built_in = module and module.__name__.startswith("azure.ai.evaluation._evaluators.")
+    if built_in:
+        content_safety = module.__name__.startswith("azure.ai.evaluation._evaluators._content_safety")
+
+    if content_safety:
+        return "content-safety"
+    if built_in:
+        return "built-in"
+    return "custom"
+
+
+def _get_evaluator_properties(evaluator, evaluator_name):
+    """
+    Get evaluator properties for telemetry.
+
+    :param: evaluator: The evaluator object
+    :param: evaluator_name: The alias for the evaluator
+    :type: str
+    :raises Exception: If the evaluator properties cannot be retrieved
+    :return: A dictionary containing the evaluator properties, including
+        "name": A name for the evaluator
+        "pf_type": The promptflow type being used
+        "type": The evaluator type. Accepted values are "built-in", "custom", and "content-safety"
+        "alias": The alias for the evaluator. Defaults to an empty string.
+    :rtype: Dict[str, str]
+    """
+
+    try:
+        # Cover flex flow and prompty based evaluator
+        if isinstance(evaluator, (prompty_sdk, prompty_core, flex_flow)):
+            name = evaluator.name
+            pf_type = evaluator.__class__.__name__
+        # Cover dag flow based evaluator
+        elif isinstance(evaluator, dag_flow):
+            name = evaluator.name
+            pf_type = "DagFlow"
+        elif inspect.isfunction(evaluator):
+            name = evaluator.__name__
+            pf_type = flex_flow.__name__
+        elif hasattr(evaluator, "__class__") and callable(evaluator):
+            name = evaluator.__class__.__name__
+            pf_type = flex_flow.__name__
+        else:
+            # fallback option
+            name = str(evaluator)
+            pf_type = "Unknown"
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        LOGGER.debug(f"Failed to get evaluator properties: {e}")
+        name = str(evaluator)
+        pf_type = "Unknown"
+
+    return {
+        "name": name,
+        "pf_type": pf_type,
+        "type": _get_evaluator_type(evaluator),
+        "alias": evaluator_name if evaluator_name else "",
+    }
+
+
+# cspell:ignore isna
+def log_evaluate_activity(func) -> None:
+    """Decorator to log evaluate activity
+
+    :param func: The function to be decorated
+    :type func: Callable
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs) -> Callable:
+        from promptflow._sdk._telemetry import ActivityType, log_activity
+        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
+
+        evaluators = kwargs.get("evaluators", [])
+        azure_ai_project = kwargs.get("azure_ai_project", None)
+
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+
+        track_in_cloud = bool(pf_client._config.get_trace_destination())
+        evaluate_target = bool(kwargs.get("target", None))
+        evaluator_config = bool(kwargs.get("evaluator_config", None))
+        custom_dimensions = {
+            "track_in_cloud": track_in_cloud,
+            "evaluate_target": evaluate_target,
+            "evaluator_config": evaluator_config,
+        }
+
+        with log_activity(
+            get_telemetry_logger(),
+            "pf.evals.evaluate",
+            activity_type=ActivityType.PUBLICAPI,
+            user_agent=USER_AGENT,
+            custom_dimensions=custom_dimensions,
+        ):
+            result = func(*args, **kwargs)
+
+            try:
+                evaluators_info = []
+                for evaluator_name, evaluator in evaluators.items():
+                    evaluator_info = _get_evaluator_properties(evaluator, evaluator_name)
+                    try:
+                        evaluator_df = pd.DataFrame(result.get("rows", [])).filter(
+                            like=f"outputs.{evaluator_name}", axis=1
+                        )
+
+                        failed_rows = (
+                            evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum())
+                        )
+                        total_rows = evaluator_df.shape[0]
+
+                        evaluator_info["failed_rows"] = failed_rows
+                        evaluator_info["total_rows"] = total_rows
+                    except Exception as e:  # pylint: disable=broad-exception-caught
+                        LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}")
+                    evaluators_info.append(evaluator_info)
+
+                custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)}
+                with log_activity(
+                    get_telemetry_logger(),
+                    "pf.evals.evaluate_usage_info",
+                    activity_type=ActivityType.PUBLICAPI,
+                    user_agent=USER_AGENT,
+                    custom_dimensions=custom_dimensions,
+                ):
+                    pass
+            except Exception as e:  # pylint: disable=broad-exception-caught
+                LOGGER.debug(f"Failed to collect evaluate usage info: {e}")

+            return result
+
+    return wrapper
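The decorator above pulls `evaluators`, `azure_ai_project`, `target`, and `evaluator_config` out of keyword arguments and emits `pf.evals.evaluate` and `pf.evals.evaluate_usage_info` activities around the wrapped call. The following sketch (not part of the package diff) shows how it might be applied, assuming promptflow is installed; `my_evaluate` and its arguments are hypothetical stand-ins for an evaluate-style function.

# Hypothetical usage sketch of log_evaluate_activity; "my_evaluate" is a
# made-up evaluate-style function, not part of the package.
from azure.ai.evaluation._evaluate._telemetry import log_evaluate_activity


@log_evaluate_activity
def my_evaluate(*, data=None, evaluators=None, azure_ai_project=None, **kwargs):
    # The wrapper expects a dict with a "rows" key when it collects
    # per-evaluator failed/total row counts.
    return {"rows": []}


# Arguments must be passed as keywords for the telemetry dimensions to be populated.
result = my_evaluate(data="data.jsonl", evaluators={"bleu": lambda **kw: {}})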
azure/ai/evaluation/_evaluate/_utils.py
@@ -0,0 +1,237 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import json
+import logging
+import os
+import re
+import tempfile
+from collections import namedtuple
+from pathlib import Path
+
+import pandas as pd
+
+from azure.ai.evaluation._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME, Prefixes
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluate._eval_run import EvalRun
+
+LOGGER = logging.getLogger(__name__)
+
+AZURE_WORKSPACE_REGEX_FORMAT = (
+    "^azureml:[/]{1,2}subscriptions/([^/]+)/resource(groups|Groups)/([^/]+)"
+    "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
+)
+
+AzureMLWorkspaceTriad = namedtuple("AzureMLWorkspace", ["subscription_id", "resource_group_name", "workspace_name"])
+
+
+def is_none(value):
+    return value is None or str(value).lower() == "none"
+
+
+def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint: disable=name-too-long
+    match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
+    if not match or len(match.groups()) != 5:
+        raise EvaluationException(
+            message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            f"workspaces/<workspace_name>, got {trace_provider}",
+            internal_message="Malformed trace provider string, expected azureml://subscriptions/<subscription_id>/"
+            "resourceGroups/<resource_group>/providers/Microsoft.MachineLearningServices/"
+            "workspaces/<workspace_name>,",
+            target=ErrorTarget.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.UNKNOWN,
+        )
+    subscription_id = match.group(1)
+    resource_group_name = match.group(3)
+    workspace_name = match.group(5)
+    return AzureMLWorkspaceTriad(subscription_id, resource_group_name, workspace_name)
+
+
+def load_jsonl(path):
+    with open(path, "r", encoding="utf-8") as f:
+        return [json.loads(line) for line in f.readlines()]
+
+
+def _azure_pf_client_and_triad(trace_destination):
+    from promptflow.azure._cli._utils import _get_azure_pf_client
+
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    azure_pf_client = _get_azure_pf_client(
+        subscription_id=ws_triad.subscription_id,
+        resource_group=ws_triad.resource_group_name,
+        workspace_name=ws_triad.workspace_name,
+    )
+
+    return azure_pf_client, ws_triad
+
+
+def _log_metrics_and_instance_results(
+    metrics,
+    instance_results,
+    trace_destination,
+    run,
+    evaluation_name,
+) -> str:
+    if trace_destination is None:
+        LOGGER.error("Unable to log traces as trace destination was not defined.")
+        return None
+
+    azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
+    tracking_uri = azure_pf_client.ml_client.workspaces.get(ws_triad.workspace_name).mlflow_tracking_uri
+
+    # Adding line_number as index column this is needed by UI to form link to individual instance run
+    instance_results["line_number"] = instance_results.index.values
+
+    with EvalRun(
+        run_name=run.name if run is not None else evaluation_name,
+        tracking_uri=tracking_uri,
+        subscription_id=ws_triad.subscription_id,
+        group_name=ws_triad.resource_group_name,
+        workspace_name=ws_triad.workspace_name,
+        ml_client=azure_pf_client.ml_client,
+        promptflow_run=run,
+    ) as ev_run:
+
+        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp_path = os.path.join(tmpdir, artifact_name)
+
+            with open(tmp_path, "w", encoding="utf-8") as f:
+                f.write(instance_results.to_json(orient="records", lines=True))
+
+            ev_run.log_artifact(tmpdir, artifact_name)
+
+            # Using mlflow to create a dummy run since once created via PF show traces of dummy run in UI.
+            # Those traces can be confusing.
+            # adding these properties to avoid showing traces if a dummy run is created.
+            # We are doing that only for the pure evaluation runs.
+            if run is None:
+                ev_run.write_properties_to_run_history(
+                    properties={
+                        "_azureml.evaluation_run": "azure-ai-generative-parent",
+                        "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
+                        "isEvaluatorRun": "true",
+                    }
+                )
+
+        for metric_name, metric_value in metrics.items():
+            ev_run.log_metric(metric_name, metric_value)
+
+        evaluation_id = ev_run.info.run_name if run is not None else ev_run.info.run_id
+        return _get_ai_studio_url(trace_destination=trace_destination, evaluation_id=evaluation_id)
+
+
+def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
+    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
+    studio_base_url = os.getenv("AI_STUDIO_BASE_URL", "https://ai.azure.com")
+
+    studio_url = (
+        f"{studio_base_url}/build/evaluation/{evaluation_id}?wsid=/subscriptions/{ws_triad.subscription_id}"
+        f"/resourceGroups/{ws_triad.resource_group_name}/providers/Microsoft.MachineLearningServices/"
+        f"workspaces/{ws_triad.workspace_name}"
+    )
+
+    return studio_url
+
+
+def _trace_destination_from_project_scope(project_scope: dict) -> str:
+    subscription_id = project_scope["subscription_id"]
+    resource_group_name = project_scope["resource_group_name"]
+    workspace_name = project_scope["project_name"]
+
+    trace_destination = (
+        f"azureml://subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/"
+        f"providers/Microsoft.MachineLearningServices/workspaces/{workspace_name}"
+    )
+
+    return trace_destination
+
+
+def _write_output(path, data_dict):
+    p = Path(path)
+    if os.path.isdir(path):
+        p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
+
+    with open(p, "w") as f:
+        json.dump(data_dict, f)
+
+
+def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False) -> pd.DataFrame:
+    """
+    Apply column mapping to source_df based on mapping_config.
+
+    This function is used for pre-validation of input data for evaluators
+    :param source_df: the data frame to be changed.
+    :type source_df: pd.DataFrame
+    :param mapping_config: The configuration, containing column mapping.
+    :type mapping_config: dict.
+    :param inplace: If true, the source_df will be changed inplace.
+    :type inplace: bool
+    :return: The modified data frame.
+    """
+    result_df = source_df
+
+    if mapping_config:
+        column_mapping = {}
+        columns_to_drop = set()
+        pattern_prefix = "data."
+        run_outputs_prefix = "run.outputs."
+
+        for map_to_key, map_value in mapping_config.items():
+            match = re.search(r"^\${([^{}]+)}$", map_value)
+            if match is not None:
+                pattern = match.group(1)
+                if pattern.startswith(pattern_prefix):
+                    map_from_key = pattern[len(pattern_prefix) :]
+                elif pattern.startswith(run_outputs_prefix):
+                    # Target-generated columns always starts from .outputs.
+                    map_from_key = f"{Prefixes.TSG_OUTPUTS}{pattern[len(run_outputs_prefix) :]}"
+                # if we are not renaming anything, skip.
+                if map_from_key == map_to_key:
+                    continue
+                # If column needs to be mapped to already existing column, we will add it
+                # to the drop list.
+                if map_to_key in source_df.columns:
+                    columns_to_drop.add(map_to_key)
+                column_mapping[map_from_key] = map_to_key
+        # If we map column to another one, which is already present in the data
+        # set and the letter also needs to be mapped, we will not drop it, but map
+        # instead.
+        columns_to_drop = columns_to_drop - set(column_mapping.keys())
+        result_df = source_df.drop(columns=columns_to_drop, inplace=inplace)
+        result_df.rename(columns=column_mapping, inplace=True)
+
+    return result_df
+
+
+def _has_aggregator(evaluator):
+    return hasattr(evaluator, "__aggregate__")
+
+
+def get_int_env_var(env_var_name, default_value=None):
+    """
+    The function `get_int_env_var` retrieves an integer environment variable value, with an optional
+    default value if the variable is not set or cannot be converted to an integer.
+
+    :param env_var_name: The name of the environment variable you want to retrieve the value of
+    :param default_value: The default value is the value that will be returned if the environment
+    variable is not found or if it cannot be converted to an integer
+    :return: an integer value.
+    """
+    try:
+        return int(os.environ.get(env_var_name, default_value))
+    except Exception:
+        return default_value
+
+
+def set_event_loop_policy():
+    import asyncio
+    import platform
+
+    if platform.system().lower() == "windows":
+        # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
+        # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
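The two trace-destination helpers above are essentially inverses: `_trace_destination_from_project_scope` builds an `azureml://` URI from a project scope dict, and `extract_workspace_triad_from_trace_provider` parses such a URI back into a workspace triad. A small sketch (not part of the package diff), using made-up IDs; both helpers are private and may change between beta releases.

# Illustrative only: made-up IDs, private helpers that may change between betas.
from azure.ai.evaluation._evaluate._utils import (
    _trace_destination_from_project_scope,
    extract_workspace_triad_from_trace_provider,
)

project_scope = {
    "subscription_id": "00000000-0000-0000-0000-000000000000",
    "resource_group_name": "my-resource-group",
    "project_name": "my-project",
}

# Builds: azureml://subscriptions/<sub>/resourceGroups/my-resource-group/
#         providers/Microsoft.MachineLearningServices/workspaces/my-project
trace_destination = _trace_destination_from_project_scope(project_scope)

# Parses the URI back into (subscription_id, resource_group_name, workspace_name).
triad = extract_workspace_triad_from_trace_provider(trace_destination)
assert triad.workspace_name == "my-project"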
azure/ai/evaluation/_evaluators/_bleu/__init__.py
@@ -0,0 +1,9 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._bleu import BleuScoreEvaluator
+
+__all__ = [
+    "BleuScoreEvaluator",
+]
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -0,0 +1,73 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
+
+from promptflow._utils.async_utils import async_run_allowing_running_loop
+from azure.ai.evaluation._common.utils import nltk_tokenize
+
+
+class _AsyncBleuScoreEvaluator:
+    def __init__(self):
+        pass
+
+    async def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        reference_tokens = nltk_tokenize(ground_truth)
+        hypothesis_tokens = nltk_tokenize(response)
+
+        # NIST Smoothing
+        smoothing_function = SmoothingFunction().method4
+        score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
+
+        return {
+            "bleu_score": score,
+        }
+
+
+class BleuScoreEvaluator:
+    """
+    Evaluator that computes the BLEU Score between two strings.
+
+    BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine
+    translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the
+    generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating
+    better quality.
+
+    **Usage**
+
+    .. code-block:: python
+
+        eval_fn = BleuScoreEvaluator()
+        result = eval_fn(
+            response="Tokyo is the capital of Japan.",
+            ground_truth="The capital of Japan is Tokyo.")
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "bleu_score": 0.22
+        }
+    """

+    def __init__(self):
+        self._async_evaluator = _AsyncBleuScoreEvaluator()
+
+    def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        """
+        Evaluate the BLEU score between the response and the ground truth.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be compared against.
+        :paramtype ground_truth: str
+        :return: The BLEU score.
+        :rtype: dict
+        """
+        return async_run_allowing_running_loop(
+            self._async_evaluator, response=response, ground_truth=ground_truth, **kwargs
+        )
+
+    def _to_async(self):
+        return self._async_evaluator
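For reference, the score computed by `_AsyncBleuScoreEvaluator` can be approximated with NLTK directly. A standalone sketch (not part of the package diff), assuming `nltk.word_tokenize` stands in for the package's internal `nltk_tokenize` helper, which may tokenize differently.

# Standalone approximation of the evaluator's BLEU computation; uses
# nltk.word_tokenize in place of the package's internal nltk_tokenize helper.
import nltk
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# Tokenizer models; newer NLTK releases may require "punkt_tab" instead.
nltk.download("punkt", quiet=True)

reference_tokens = nltk.word_tokenize("The capital of Japan is Tokyo.")
hypothesis_tokens = nltk.word_tokenize("Tokyo is the capital of Japan.")

score = sentence_bleu(
    [reference_tokens],
    hypothesis_tokens,
    smoothing_function=SmoothingFunction().method4,  # same smoothing as the evaluator above
)
print(round(score, 2))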