azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +1 -1
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +182 -12
- azure/ai/evaluation/_constants.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
- azure/ai/evaluation/_exceptions.py +9 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +5 -5
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +112 -113
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
@@ -5,8 +5,9 @@ import inspect
 import json
 import logging
 import os
+from concurrent.futures import Future
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Any, Callable, Dict, Optional, Union, cast
 
 import pandas as pd
 from promptflow.contracts.types import AttrDict
@@ -22,25 +23,31 @@ LOGGER = logging.getLogger(__name__)
 
 class CodeRun:
     def __init__(
-        self,
-
+        self,
+        *,
+        run: Future,
+        input_data,
+        evaluator_name: Optional[str] = None,
+        aggregator: Callable[["CodeRun"], Future],
+        **kwargs,  # pylint: disable=unused-argument
+    ) -> None:
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
-        self.aggregated_metrics =
+        self.aggregated_metrics = aggregator(self)
 
-    def get_result_df(self, exclude_inputs=False):
+    def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-        result_df = self.run.result(timeout=batch_run_timeout)
+        result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
         if exclude_inputs:
             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         return result_df
 
-    def get_aggregated_metrics(self):
+    def get_aggregated_metrics(self) -> Dict[str, Any]:
         try:
             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-            aggregated_metrics = (
-                self.aggregated_metrics.result(timeout=batch_run_timeout)
+            aggregated_metrics: Optional[Any] = (
+                cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                 if self.aggregated_metrics is not None
                 else None
             )
@@ -104,10 +111,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             verify_integrity=True,
         )
 
-
+    @staticmethod
+    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
         try:
             if _has_aggregator(evaluator):
-                aggregate_input = None
                 evaluator_output = run.get_result_df(exclude_inputs=True)
                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                     aggregate_input = evaluator_output["output"].tolist()
@@ -152,21 +159,30 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             column_mapping=column_mapping,
             evaluator_name=evaluator_name,
         )
-
-
-
-
+
+        return CodeRun(
+            run=eval_future,
+            input_data=data,
+            evaluator_name=evaluator_name,
+            aggregator=lambda code_run: self._thread_pool.submit(
+                self._calculate_aggregations, evaluator=flow, run=code_run
+            ),
+        )
 
     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df
 
-    def get_metrics(self, run: CodeRun) ->
+    def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
-            return
+            return {}
         return aggregated_metrics
+
+    def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
+        # Not implemented
+        return None
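The two hunks above (apparently from code_client.py, given the CodeRun/CodeClient context) move metric aggregation behind an `aggregator` callable: `CodeRun` no longer receives a precomputed aggregated-metrics future, it asks the callable for one, and `CodeClient.run` supplies a lambda that submits `_calculate_aggregations` to its thread pool. A minimal, self-contained sketch of that pattern, using plain `concurrent.futures` and invented names such as `RunHandle` and `aggregate` rather than the library's real plumbing:

```python
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Any, Callable, Dict

pool = ThreadPoolExecutor(max_workers=2)


class RunHandle:
    """Simplified stand-in for CodeRun: a row-level future plus an aggregation future."""

    def __init__(self, *, run: Future, aggregator: Callable[["RunHandle"], Future]) -> None:
        self.run = run
        # The aggregator receives the handle itself, so the aggregation work can
        # read the row-level results as soon as they resolve.
        self.aggregated_metrics: Future = aggregator(self)

    def get_aggregated_metrics(self) -> Dict[str, Any]:
        return self.aggregated_metrics.result(timeout=60)


def aggregate(handle: RunHandle) -> Dict[str, Any]:
    rows = handle.run.result(timeout=60)  # block until the per-row results are in
    scores = [row["score"] for row in rows]
    return {"mean_score": sum(scores) / len(scores)}


row_future = pool.submit(lambda: [{"score": 3}, {"score": 5}])
handle = RunHandle(run=row_future, aggregator=lambda h: pool.submit(aggregate, h))
print(handle.get_aggregated_metrics())  # {'mean_score': 4.0}
```

The indirection lets the aggregation step reference the run object itself (as `_calculate_aggregations` does via `run.get_result_df`) without a circular constructor argument.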
@@ -3,11 +3,12 @@
 # ---------------------------------------------------------
 import inspect
 import logging
+import math
 import os
 from concurrent.futures import Future
 from typing import Any, Callable, Dict, Optional, Union
+from collections import OrderedDict
 
-import numpy as np
 import pandas as pd
 from promptflow.client import PFClient
 from promptflow.entities import Run
@@ -53,13 +54,27 @@ class ProxyClient: # pylint: disable=client-accepts-api-version-keyword
     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
         run: Run = proxy_run.run.result()
         result_df = self._pf_client.get_details(run, all_results=all_results)
-        result_df.replace("(Failed)",
+        result_df.replace("(Failed)", math.nan, inplace=True)
         return result_df
 
     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
         run: Run = proxy_run.run.result()
         return self._pf_client.get_metrics(run)
 
+    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+        run = proxy_run.run.result()
+
+        # pylint: disable=protected-access
+        return OrderedDict(
+            [
+                ("status", run.status),
+                ("duration", str(run._end_time - run._created_on)),
+                ("completed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")),
+                ("failed_lines", run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")),
+                ("log_path", str(run._output_path)),
+            ]
+        )
+
     @staticmethod
     def _should_batch_use_async(flow):
         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
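The new ProxyClient.get_run_summary condenses a promptflow run into a small ordered mapping of status, duration, completed/failed line counts, and the log path; evaluate() later pretty-prints these per-evaluator summaries (see the _print_summary hunk further down). A sketch of the shape, with made-up values standing in for the run's internal fields:

```python
import json
from collections import OrderedDict
from datetime import datetime, timedelta

# Hypothetical values standing in for a promptflow Run's internals.
created_on = datetime(2024, 10, 1, 12, 0, 0)
end_time = created_on + timedelta(seconds=42)

summary = OrderedDict(
    [
        ("status", "Completed"),
        ("duration", str(end_time - created_on)),
        ("completed_lines", 98),
        ("failed_lines", 2),
        ("log_path", "/tmp/.promptflow/runs/example_run"),
    ]
)

# evaluate() surfaces one such summary per evaluator as pretty-printed JSON.
print(json.dumps(summary, indent=4))
```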
@@ -10,10 +10,11 @@ import posixpath
 import time
 import types
 import uuid
-from typing import Any, Dict, Optional, Set, Type
+from typing import Any, Dict, List, Optional, Set, Type
 from urllib.parse import urlparse
 
 from promptflow._sdk.entities import Run
+from typing_extensions import Self
 
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_http_client
@@ -27,6 +28,7 @@ LOGGER = logging.getLogger(__name__)
 # Handle optional import. The azure libraries are only present if
 # promptflow-azure is installed.
 try:
+    from azure.ai.ml import MLClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
@@ -121,8 +123,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         self._run_name = run_name
         self._promptflow_run = promptflow_run
         self._status = RunStatus.NOT_STARTED
-        self._url_base = None
-        self.
+        self._url_base: Optional[str] = None
+        self._info: Optional[RunInfo] = None
 
     @property
     def status(self) -> RunStatus:
@@ -134,6 +136,20 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         """
         return self._status
 
+    @property
+    def info(self) -> RunInfo:
+        if self._info is None:
+            msg = "Run info is missing"
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVAL_RUN,
+                category=ErrorCategory.UNKNOWN,
+                blame=ErrorBlame.UNKNOWN,
+            )
+
+        return self._info
+
     def _get_scope(self) -> str:
         """
         Return the scope information for the workspace.
@@ -161,11 +177,11 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             )
             self._url_base = None
             self._status = RunStatus.BROKEN
-            self.
+            self._info = RunInfo.generate(self._run_name)
         else:
             self._url_base = urlparse(self._tracking_uri).netloc
             if self._promptflow_run is not None:
-                self.
+                self._info = RunInfo(
                     self._promptflow_run.name,
                     self._promptflow_run._experiment_name,  # pylint: disable=protected-access
                     self._promptflow_run.name,
@@ -182,7 +198,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
                 body["run_name"] = self._run_name
                 response = self.request_with_retry(url=url, method="POST", json_dict=body)
                 if response.status_code != 200:
-                    self.
+                    self._info = RunInfo.generate(self._run_name)
                     LOGGER.warning(
                         "The run failed to start: %s: %s."
                         "The results will be saved locally, but will not be logged to Azure.",
@@ -192,7 +208,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
                     self._status = RunStatus.BROKEN
                 else:
                     parsed_response = response.json()
-                    self.
+                    self._info = RunInfo(
                         run_id=parsed_response["run"]["info"]["run_id"],
                         experiment_id=parsed_response["run"]["info"]["experiment_id"],
                         run_name=parsed_response["run"]["info"]["run_name"],
@@ -235,7 +251,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             LOGGER.warning("Unable to terminate the run.")
         self._status = RunStatus.TERMINATED
 
-    def __enter__(self):
+    def __enter__(self) -> Self:
         """The Context Manager enter call.
 
         :return: The instance of the class.
@@ -249,7 +265,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         exc_type: Optional[Type[BaseException]],
         exc_value: Optional[BaseException],
         exc_tb: Optional[types.TracebackType],
-    ) ->
+    ) -> None:
        """The context manager exit call.
 
         :param exc_type: The exception type
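`__enter__` now returns `Self` (imported from typing_extensions a few hunks up) instead of being unannotated, and `__exit__` is annotated `-> None`. A toy context manager showing the same pattern; this is an illustrative class, not EvalRun itself:

```python
import types
from typing import Optional, Type

from typing_extensions import Self  # backport of typing.Self for Python < 3.11


class ManagedRun:
    """Toy context manager using the Self-typed __enter__ seen above."""

    def __init__(self, name: str) -> None:
        self.name = name
        self.started = False

    def __enter__(self) -> Self:
        # Returning Self (rather than "ManagedRun") keeps the annotation
        # correct for subclasses without repeating the class name.
        self.started = True
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        self.started = False


with ManagedRun("demo") as run:
    print(run.name, run.started)  # demo True
```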
@@ -408,7 +424,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             return
         # First we will list the files and the appropriate remote paths for them.
         root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
-        remote_paths = {"paths": []}
+        remote_paths: Dict[str, List[Dict[str, str]]] = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
         for root, _, filenames in os.walk(artifact_folder):
@@ -4,18 +4,22 @@
 import inspect
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+import json
 
-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
 from promptflow.client import PFClient
+from promptflow.entities import Run
+from promptflow._sdk._errors import MissingAzurePackage
 
+from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
@@ -23,16 +27,25 @@ from .._model_configurations import AzureAIProject, EvaluatorConfig
 from .._user_agent import USER_AGENT
 from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
+    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
 )
 
+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+
 
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str,
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.
|
|
|
73
86
|
defect_rate_name = col.replace("_score", "_defect_rate")
|
|
74
87
|
col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
|
|
75
88
|
defect_rates[defect_rate_name] = round(
|
|
76
|
-
|
|
89
|
+
list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
|
|
77
90
|
/ col_with_numeric_values.count(),
|
|
78
91
|
2,
|
|
79
92
|
)
|
|
@@ -107,13 +120,13 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates
 
 
-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str,
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
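The defect-rate arithmetic in the two hunks above is: count the rows whose coerced score clears the threshold (or whose label is truthy), divide by the number of scored non-NaN rows, and round to two decimals. A worked example with pandas; the threshold value 4 is only an assumption standing in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, and the built-in sum() stands in for the new list_sum helper from _common/math.py:

```python
import pandas as pd

# Hypothetical severity scores for one content-safety metric.
scores = pd.Series([0, 1, 5, 7, None, 4])
numeric = pd.to_numeric(scores, errors="coerce")

threshold = 4  # assumed stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
# sum() stands in for list_sum; NaN rows compare as False and are excluded from count().
defect_rate = round(sum(numeric >= threshold) / numeric.count(), 2)
print(defect_rate)  # 3 of the 5 scored rows are at or above the threshold -> 0.6
```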
@@ -122,7 +135,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
@@ -277,7 +290,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 
 def _validate_columns(
     df: pd.DataFrame,
-    evaluators: Dict[str,
+    evaluators: Dict[str, Callable],
     target: Optional[Callable],
     column_mapping: Dict[str, Dict[str, str]],
 ) -> None:
@@ -287,7 +300,7 @@ def _validate_columns(
     :param df: The data frame to be validated.
     :type df: pd.DataFrame
     :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :param target: The callable to be applied to data set.
     :type target: Optional[Callable]
     :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
@@ -326,7 +339,7 @@ def _apply_target_to_data(
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     _run_name: Optional[str] = None,
-) -> Tuple[pd.DataFrame, Set[str]]:
+) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.
 
@@ -348,15 +361,15 @@
     # We are manually creating the temporary directory for the flow
     # because the way tempdir remove temporary directories will
    # hang the debugger, because promptflow will keep flow directory.
-    run = pf_client.run(
+    run: Run = pf_client.run(
        flow=target,
        display_name=evaluation_name,
        data=data,
-        properties={
+        properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
        stream=True,
        name=_run_name,
    )
-    target_output = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -378,16 +391,18 @@
     return target_output, generated_columns, run
 
 
-def _process_column_mappings(
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
     """Process column_mapping to replace ${target.} with ${data.}
 
     :param column_mapping: The configuration for evaluators.
-    :type column_mapping: Dict[str, Dict[str, str]]
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
 
-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
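The hunk above types _process_column_mappings and keeps its validation regex: any ${...} reference in a per-evaluator column mapping that is not ${target.<col>} or ${data.<col>} counts as unexpected. A small illustration of that check; the evaluator and column names are made up:

```python
import re

# The validation regex from the hunk above.
unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

column_mapping = {
    "relevance": {
        "query": "${data.query}",
        "response": "${target.response}",
    },
    "broken": {
        "response": "${outputs.response}",  # neither ${target.} nor ${data.}: rejected
    },
}

for evaluator_name, mapping in column_mapping.items():
    bad = [v for v in mapping.values() if unexpected_references.search(v)]
    print(evaluator_name, "->", "ok" if not bad else f"unexpected references: {bad}")
```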
@@ -554,41 +569,69 @@ def evaluate(
         raise e
 
 
+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data:
-    evaluators: Optional[Dict[str, Callable]] = None,
+    data: str,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
     output_path: Optional[str] = None,
     **kwargs,
-):
+) -> EvaluateResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
 
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
-    column_mapping =
-
-
-
-
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
     _validate_columns(input_data_df, evaluators, target, column_mapping)
 
     # Target Run
-
-
-
-
-
-
+    try:
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+    # pylint: disable=raise-missing-from
+    except MissingAzurePackage:
+        msg = (
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        )
+
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_PACKAGE,
+            blame=ErrorBlame.USER_ERROR,
+        )
 
-    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
-    target_run = None
-    target_generated_columns = set()
+    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+    target_run: Optional[Run] = None
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
@@ -627,45 +670,54 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         # Also ignore columns that are already in config, since they've been covered by target mapping.
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with BatchRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
     # Batch Run
-    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
-
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
+
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
-        batch_run_client = CodeClient()
         data = input_data_df
-
-        with BatchRunContext(batch_run_client):
-            for evaluator_name, evaluator in evaluators.items():
-                evaluators_info[evaluator_name] = {}
-                evaluators_info[evaluator_name]["run"] = batch_run_client.run(
-                    flow=evaluator,
-                    run=target_run,
-                    evaluator_name=evaluator_name,
-                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                    data=data,
-                    stream=True,
-                    name=kwargs.get("_run_name"),
-                )
-
-        # get_details needs to be called within BatchRunContext scope in order to have user agent populated
-        for evaluator_name, evaluator_info in evaluators_info.items():
-            evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
-            evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
 
     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name,
-    evaluator_result_df =
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]
 
         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
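eval_batch_run (above) returns one __EvaluatorInfo-shaped entry per evaluator: the row-level result dataframe, the aggregated metrics, and the run summary. The loop that follows flattens the metrics into "<evaluator>.<metric>" keys. A sketch with fabricated values standing in for real evaluator output:

```python
from typing import Any, Dict

import pandas as pd

# Illustrative stand-in for what eval_batch_run returns.
per_evaluator_results: Dict[str, Dict[str, Any]] = {
    "relevance": {
        "result": pd.DataFrame({"outputs.relevance": [4, 5]}),
        "metrics": {"relevance": 4.5},
        "run_summary": {"status": "Completed", "completed_lines": 2, "failed_lines": 0},
    },
    "fluency": {
        "result": pd.DataFrame({"outputs.fluency": [3, 4]}),
        "metrics": {"fluency": 3.5},
        "run_summary": {"status": "Completed", "completed_lines": 2, "failed_lines": 0},
    },
}

# Metrics are flattened into "<evaluator>.<metric>" keys, as in the hunk that follows.
evaluators_metric: Dict[str, float] = {}
for evaluator_name, evaluator_result in per_evaluator_results.items():
    evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})

print(evaluators_metric)  # {'relevance.relevance': 4.5, 'fluency.fluency': 3.5}
```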
@@ -688,7 +740,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             else evaluator_result_df
         )
 
-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
 
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -706,9 +758,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         evaluation_name,
     )
 
-    result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
+    result: EvaluateResult = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
 
     if output_path:
         _write_output(output_path, result)
 
+    _print_summary(per_evaluator_results)
+
     return result
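Taken together, the _evaluate changes keep the public return shape: a mapping with "rows", "metrics", and "studio_url", now annotated as EvaluateResult, plus a combined per-evaluator summary printed at the end. A usage sketch of the public entry point; the import names, evaluator choice, and file paths are assumptions for illustration rather than part of this diff:

```python
# A usage sketch only: the evaluator import and the JSONL path are assumptions
# based on the package's public surface, not something this diff introduces.
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

result = evaluate(
    data="data.jsonl",  # one JSON object per line with "response" and "ground_truth"
    evaluators={"f1": F1ScoreEvaluator()},
    evaluator_config={
        "f1": {
            "column_mapping": {
                "response": "${data.response}",
                "ground_truth": "${data.ground_truth}",
            }
        }
    },
    output_path="results.json",
)

# The return value matches the EvaluateResult shape annotated in the hunk above.
print(result["metrics"])     # aggregate metrics, e.g. a key like "f1.f1_score"
print(len(result["rows"]))   # per-line results
print(result["studio_url"])  # None unless an azure_ai_project was supplied
```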