azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +11 -0
- azure/ai/evaluation/_common/rai_service.py +172 -35
- azure/ai/evaluation/_common/utils.py +162 -23
- azure/ai/evaluation/_constants.py +6 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
- azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
- azure/ai/evaluation/_evaluate/_utils.py +40 -7
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
- azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
- azure/ai/evaluation/_exceptions.py +17 -0
- azure/ai/evaluation/_model_configurations.py +18 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +115 -61
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
Diff of azure/ai/evaluation/_evaluate/_evaluate.py:

@@ -2,18 +2,19 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import json
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
-import json

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
-from promptflow._sdk._errors import MissingAzurePackage

 from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (

@@ -23,11 +24,10 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from .
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
 from ._utils import (
-    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,

@@ -36,6 +36,14 @@ from ._utils import (

 TClient = TypeVar("TClient", ProxyClient, CodeClient)

+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled seprately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}
+

 class __EvaluatorInfo(TypedDict):
     result: pd.DataFrame

@@ -43,6 +51,32 @@ class __EvaluatorInfo(TypedDict):
     run_summary: Dict[str, Any]


+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+    and the second element is a dictionary of resultant new metric column names and their values.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    renamed_cols = []
+    metric_columns = {}
+    for col in df.columns:
+        metric_prefix = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+            renamed_cols.append(col)
+            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            metric_columns[new_col_name] = round(
+                list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
+                2,
+            )
+
+    return renamed_cols, metric_columns
+
+
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
     df: pd.DataFrame, evaluators: Dict[str, Callable]

@@ -146,8 +180,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # Rename certain columns as defect rates if we know that's what their aggregates represent
     # Content safety metrics
     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
     handled_columns.extend(content_safety_cols)
+    handled_columns.extend(other_renamed_cols)
     defect_rates.update(cs_defect_rates)
+    defect_rates.update(renamed_cols)
     # Label-based (true/false) metrics where 'true' means 'something is wrong'
     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
     handled_columns.extend(label_cols)
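The hunks above introduce `METRIC_COLUMN_NAME_REPLACEMENTS` and `_aggregate_other_metrics`, so that aggregates whose meaning differs from the per-row column (currently only `groundedness_pro_label`) are reported under a replacement name. A minimal sketch of the averaging, using a toy dataframe in the "<evaluator>.<metric>" column shape the aggregation helpers expect (the data here is illustrative, not SDK output):

```python
import pandas as pd

# Toy per-row results: one boolean label per evaluated row.
df = pd.DataFrame({"groundedness_pro.groundedness_pro_label": [True, True, False, True]})

col = "groundedness_pro.groundedness_pro_label"
numeric = pd.to_numeric(df[col], errors="coerce")

# Same averaging as the new helper: mean of the numeric values, rounded to 2 places,
# surfaced under the replacement name rather than the originating column name.
passing_rate = round(numeric.sum() / numeric.count(), 2)
print({"groundedness_pro.groundedness_pro_passing_rate": passing_rate})  # 0.75
```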
@@ -163,34 +200,127 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     return metrics


-def
+def _validate_columns_for_target(
+    df: pd.DataFrame,
+    target: Callable,
+) -> None:
+    """
+    Check that all columns needed by target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+    """
+    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    # If the target function is given, it may return
+    # several columns and hence we cannot check the availability of columns
+    # without knowing target function semantics.
+    # Instead, here we will validate the columns, taken by target.
     required_inputs = [
         param.name
-        for param in inspect.signature(
+        for param in inspect.signature(target).parameters.values()
         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
     ]

-    missing_inputs = [col for col in required_inputs if col not in
-    if missing_inputs and "conversation" in required_inputs:
-        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
-        if len(missing_inputs) == len(non_conversation_inputs) and [
-            input in non_conversation_inputs for input in missing_inputs
-        ]:
-            missing_inputs = []
+    missing_inputs = [col for col in required_inputs if col not in df.columns]
     if missing_inputs:
-
-        msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
-            blame=ErrorBlame.USER_ERROR,
-        )
-        msg = f"Missing required inputs for target : {missing_inputs}."
+        msg = f"Missing required inputs for target: {missing_inputs}."
         raise EvaluationException(
             message=msg,
-
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_columns_for_evaluators(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Callable],
+    target: Optional[Callable],
+    target_generated_columns: Optional[Set[str]],
+    column_mapping: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluators are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param target_generated_columns: The set of columns generated by the target callable.
+    :type target_generated_columns: Optional[Set[str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+    :type column_mapping: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+    """
+    missing_inputs_per_evaluator = {}
+
+    for evaluator_name, evaluator in evaluators.items():
+        # Apply column mapping
+        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+        new_df = _apply_column_mapping(df, mapping_config)
+
+        # Validate input data for evaluator
+        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+        if is_built_in:
+            # Note that for built-in evaluators supporting the "conversation" parameter,
+            # input parameters are now optional.
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.name not in ["kwargs", "args", "self"]
+            ]
+
+            if "conversation" in evaluator_params and "conversation" in new_df.columns:
+                # Ignore the missing fields if "conversation" presents in the input data
+                missing_inputs = []
+            else:
+                missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+                # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+                # Otherwise, remove it from the missing inputs
+                if "conversation" in missing_inputs:
+                    if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+                        missing_inputs.remove("conversation")
+        else:
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+            ]
+
+            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+        if missing_inputs:
+            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+    if missing_inputs_per_evaluator:
+        msg = "Some evaluators are missing required inputs:\n"
+        for evaluator_name, missing in missing_inputs_per_evaluator.items():
+            msg += f"- {evaluator_name}: {missing}\n"
+
+        # Add the additional notes
+        msg += "\nTo resolve this issue:\n"
+        msg += "- Ensure the data contains required inputs.\n"
+        if target is not None:
+            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+            msg += f"Currently generated columns: {target_generated_columns} \n"
+        msg += "- Check that the column mapping is correctly configured."
+
+        raise EvaluationException(
+            message=msg.strip(),
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
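The new `_validate_columns_for_evaluators` treats built-in evaluators differently: when an evaluator accepts a `conversation` parameter and the column-mapped data supplies one, the remaining inputs are no longer required. A minimal sketch of that rule in isolation, using a hypothetical evaluator signature rather than a real SDK evaluator:

```python
import inspect

# Hypothetical built-in-style evaluator: single-turn inputs or a conversation.
def sample_evaluator(*, query=None, response=None, conversation=None, **kwargs):
    ...

params = [
    p.name
    for p in inspect.signature(sample_evaluator).parameters.values()
    if p.name not in ["kwargs", "args", "self"]
]

data_columns = {"conversation"}  # columns present in the (mapped) input data

# Mirrors the rule above: if the evaluator accepts "conversation" and the data
# provides it, the other inputs are not treated as missing.
if "conversation" in params and "conversation" in data_columns:
    missing = []
else:
    missing = [p for p in params if p not in data_columns]

print(missing)  # [] -> no validation error for this evaluator
```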
@@ -199,76 +329,85 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta

 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if
-
-
-
-
-
-
-
-
-
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )

-    if
-        if not isinstance(
-            msg = "
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-
-        if not
-            msg = "
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

     if azure_ai_project is not None:
-
-        msg = "azure_ai_project parameter must be a dictionary."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.INVALID_VALUE,
-            blame=ErrorBlame.USER_ERROR,
-        )
+        validate_azure_ai_project(azure_ai_project)

     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,

@@ -278,8 +417,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
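With the stricter argument validation above, `data` must be a string or path-like pointing at an existing file, and it is loaded with `pd.read_json(..., lines=True)`, i.e. JSONL. A minimal sketch of producing such a file; the field names (`query`, `response`, `ground_truth`, `context`) are illustrative and should match whatever the chosen evaluators or column mapping expect:

```python
import json

rows = [
    {"query": "What is the capital of France?", "response": "Paris.",
     "ground_truth": "Paris", "context": "France's capital city is Paris."},
    {"query": "What is 2 + 2?", "response": "4",
     "ground_truth": "4", "context": "Basic arithmetic."},
]

# One JSON object per line, which is what pd.read_json(data, lines=True) expects.
with open("eval_data.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```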
@@ -288,57 +426,13 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     return initial_data_df


-def _validate_columns(
-    df: pd.DataFrame,
-    evaluators: Dict[str, Callable],
-    target: Optional[Callable],
-    column_mapping: Dict[str, Dict[str, str]],
-) -> None:
-    """
-    Check that all columns needed by evaluator or target function are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Callable]
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
-    :type column_mapping: Dict[str, Dict[str, str]]
-    :raises EvaluationException: If column starts from "__outputs." while target is defined.
-    """
-    if target:
-        if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-            msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        # If the target function is given, it may return
-        # several columns and hence we cannot check the availability of columns
-        # without knowing target function semantics.
-        # Instead, here we will validate the columns, taken by target.
-        _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
-    else:
-        for evaluator_name, evaluator in evaluators.items():
-            # Apply column mapping
-            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-            new_df = _apply_column_mapping(df, mapping_config)
-
-            # Validate input data for evaluator
-            _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
-
-
 def _apply_target_to_data(
     target: Callable,
-    data: str,
+    data: Union[str, os.PathLike],
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
-
+    **kwargs,
 ) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.

@@ -346,29 +440,45 @@ def _apply_target_to_data(
     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data: str
+    :type data: Union[str, os.PathLike]
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
     :type evaluation_name: Optional[str]
-    :param _run_name: The name of target run. Used for testing only.
-    :type _run_name: Optional[str]
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
-
-
-
-
-
-
-
-
-
-
-
+    _run_name = kwargs.get("_run_name")
+    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+
+    try:
+        with TargetRunContext(upload_target_snaphot):
+            run: Run = pf_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
+                stream=True,
+                name=_run_name,
+            )
+    except (UserAuthenticationError, UploadInternalError) as ex:
+        if "Failed to upload run" in ex.message:
+            msg = (
+                "Failed to upload the target run to the cloud. "
+                "This may be caused by insufficient permission to access storage or other errors."
+            )
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+            ) from ex
+
+        raise ex
+
     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
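The rewritten `_apply_target_to_data` runs the target through `pf_client.run(flow=target, data=...)` inside the new `TargetRunContext`, and honors optional private `_run_name` / `_upload_target_snapshot` kwargs. For orientation, a hypothetical target callable of the kind `evaluate()` can apply: its parameters must exist as columns in the input data (enforced by `_validate_columns_for_target`), and the keys it returns become target-generated columns that evaluators can consume, e.g. via a `"${target.response}"` mapping:

```python
# Hypothetical target function; not part of the SDK.
def answer_query(query: str) -> dict:
    """Stand-in for calling the application under evaluation."""
    # In a real setup this would call a model or service; here we just echo.
    return {"response": f"You asked: {query}"}
```

Passing `target=answer_query` to `evaluate()` would then add a generated `response` column before the evaluators run.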
@@ -456,15 +566,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

@@ -489,7 +599,7 @@ def evaluate(
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype:
+    :rtype: ~azure.ai.evaluation.EvaluationResult

     :Example:

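`evaluate()` (and the internal `_evaluate()`) is now annotated to return `EvaluationResult`, the dict assembled near the end of `_evaluate` with `rows`, `metrics`, and `studio_url`. A minimal usage sketch, assuming the JSONL file from earlier and the built-in `F1ScoreEvaluator` (which reads `response` and `ground_truth` columns):

```python
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

result = evaluate(
    data="eval_data.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
)

print(result["metrics"])         # aggregated metrics, e.g. {"f1.f1_score": ...}
print(len(result["rows"]))       # per-row inputs plus evaluator outputs
print(result.get("studio_url"))  # populated only when azure_ai_project is provided
```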
@@ -563,7 +673,17 @@ def evaluate(
             internal_message=error_message,
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.FAILED_EXECUTION,
-            blame=ErrorBlame.
+            blame=ErrorBlame.USER_ERROR,
+        ) from e
+
+        # Ensure a consistent user experience when encountering errors by converting
+        # all other exceptions to EvaluationException.
+        if not isinstance(e, EvaluationException):
+            raise EvaluationException(
+                message=str(e),
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
         ) from e

         raise e
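The added fallback above wraps any non-`EvaluationException` raised during evaluation into an `EvaluationException`, so callers can rely on a single exception type. A sketch of catching it; note the import comes from a private module in this version, so the path may change:

```python
from azure.ai.evaluation import evaluate
from azure.ai.evaluation._exceptions import EvaluationException  # private module in 1.0.0b5

try:
    result = evaluate(data="does_not_exist.jsonl", evaluators={})
except EvaluationException as exc:
    # Both input-validation errors and converted internal failures land here.
    print(f"Evaluation failed: {exc}")
```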
@@ -586,12 +706,12 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-) ->
+) -> EvaluationResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}

@@ -604,7 +724,9 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             for evaluator_name, evaluator_configuration in evaluator_config.items()
         }
     )
-
+
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)

     # Target Run
     try:

@@ -623,7 +745,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
         )

-        raise EvaluationException(
+        raise EvaluationException(  # pylint: disable=raise-missing-from
            message=msg,
            target=ErrorTarget.EVALUATE,
            category=ErrorCategory.MISSING_PACKAGE,

@@ -631,6 +753,11 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         )

     trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+
+    # Handle the case where the customer manually run "pf config set trace.destination=none"
+    if trace_destination and trace_destination.lower() == "none":
+        trace_destination = None
+
     target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
@@ -639,9 +766,10 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name,
+            target, data, pf_client, input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():

@@ -656,9 +784,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
             if col not in mapping and run_output not in mapped_to_values:
                 column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup

-
-
-    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+    # After we have generated all columns, we can check if we have everything we need for evaluators.
+    _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.

@@ -674,7 +801,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     def eval_batch_run(
         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
     ) -> Dict[str, __EvaluatorInfo]:
-        with
+        with EvalRunContext(batch_run_client):
             runs = {
                 evaluator_name: batch_run_client.run(
                     flow=evaluator,

@@ -688,7 +815,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
                 for evaluator_name, evaluator in evaluators.items()
             }

-            # get_details needs to be called within
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
             return {
                 evaluator_name: {
                     "result": batch_run_client.get_details(run, all_results=True),

@@ -758,7 +885,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         evaluation_name,
     )

-
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore

     if output_path:
         _write_output(output_path, result)
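When `output_path` is given, the assembled `EvaluationResult` is also written to disk via `_write_output` (and the earlier validation requires the parent directory to exist). A sketch assuming the artifact is JSON-serialized, which is an implementation detail rather than a documented contract:

```python
import json
import os

from azure.ai.evaluation import F1ScoreEvaluator, evaluate

os.makedirs("results", exist_ok=True)  # output directory must already exist

result = evaluate(
    data="eval_data.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
    output_path="results/eval_result.json",
)

# Assuming JSON serialization of the result dict:
with open("results/eval_result.json", encoding="utf-8") as f:
    saved = json.load(f)
print(saved["metrics"] == result["metrics"])
```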
Diff of azure/ai/evaluation/_evaluate/_telemetry/__init__.py:

@@ -16,10 +16,10 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec

-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult

 from ..._user_agent import USER_AGENT
-from .._utils import
+from .._utils import _trace_destination_from_project_scope

 LOGGER = logging.getLogger(__name__)


@@ -97,17 +97,17 @@ def _get_evaluator_properties(evaluator, evaluator_name):


 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P,
+def log_evaluate_activity(func: Callable[P, EvaluationResult]) -> Callable[P, EvaluationResult]:
     """Decorator to log evaluate activity

     :param func: The function to be decorated
     :type func: Callable
     :returns: The decorated function
-    :rtype: Callable[P,
+    :rtype: Callable[P, EvaluationResult]
     """

     @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) ->
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluationResult:
         from promptflow._sdk._telemetry import ActivityType, log_activity
         from promptflow._sdk._telemetry.telemetry import get_telemetry_logger

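The telemetry changes are typing-only: `log_evaluate_activity` now advertises `Callable[P, EvaluationResult] -> Callable[P, EvaluationResult]`. As a generic illustration (not the SDK's implementation), this is how a `ParamSpec`-typed decorator preserves the wrapped function's signature for type checkers:

```python
import functools
from typing import Callable, TypeVar

from typing_extensions import ParamSpec

P = ParamSpec("P")
R = TypeVar("R")

def log_activity(func: Callable[P, R]) -> Callable[P, R]:
    """Toy decorator: type checkers see the same parameters and return type as func."""
    @functools.wraps(func)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
        print(f"calling {func.__name__}")
        return func(*args, **kwargs)
    return wrapper

@log_activity
def add(x: int, y: int) -> int:
    return x + y

print(add(2, 3))  # 5; the static type of add stays (int, int) -> int
```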