azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +4 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +73 -2
- azure/ai/evaluation/_common/rai_service.py +250 -62
- azure/ai/evaluation/_common/utils.py +196 -23
- azure/ai/evaluation/_constants.py +7 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +13 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +19 -6
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +55 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +312 -228
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +7 -6
- azure/ai/evaluation/_evaluate/_utils.py +46 -11
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +17 -18
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +67 -31
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +37 -24
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +21 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +52 -16
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +91 -48
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +100 -26
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +94 -26
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +96 -26
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +97 -26
- azure/ai/evaluation/_evaluators/_eci/_eci.py +31 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +67 -36
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +14 -16
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +106 -34
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +20 -27
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +87 -31
- azure/ai/evaluation/_evaluators/_qa/_qa.py +23 -31
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +72 -36
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +83 -125
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +26 -27
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +37 -28
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +94 -33
- azure/ai/evaluation/_exceptions.py +19 -0
- azure/ai/evaluation/_model_configurations.py +83 -15
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +20 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +29 -35
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +17 -9
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +90 -35
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +4 -2
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +8 -4
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +165 -105
- azure/ai/evaluation/simulator/_utils.py +31 -13
- azure_ai_evaluation-1.0.1.dist-info/METADATA +600 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.1.dist-info/RECORD +119 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/METADATA +0 -535
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.1.dist-info}/top_level.txt +0 -0
Diff of azure/ai/evaluation/_evaluate/_evaluate.py:

```diff
@@ -2,18 +2,20 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import json
+import logging
 import os
 import re
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
-import json

 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
 from promptflow.entities import Run
-from promptflow._sdk._errors import MissingAzurePackage

-from azure.ai.evaluation._common.math import
+from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (
@@ -23,11 +25,10 @@ from .._constants import (
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from .
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
 from ._utils import (
-    EvaluateResult,
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
@@ -35,6 +36,15 @@ from ._utils import (
 )

 TClient = TypeVar("TClient", ProxyClient, CodeClient)
+LOGGER = logging.getLogger(__name__)
+
+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled seprately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}


 class __EvaluatorInfo(TypedDict):
@@ -43,6 +53,33 @@ class __EvaluatorInfo(TypedDict):
     run_summary: Dict[str, Any]


+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+        and the second element is a dictionary of resultant new metric column names and their values.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    renamed_cols = []
+    metric_columns = {}
+    for col in df.columns:
+        metric_prefix = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+            renamed_cols.append(col)
+            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            try:
+                metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+            except EvaluationException:  # only exception that can be cause is all NaN values
+                msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+                LOGGER.warning(msg)
+
+    return renamed_cols, metric_columns
+
+
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
     df: pd.DataFrame, evaluators: Dict[str, Callable]
```
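The new `_aggregate_other_metrics` helper renames selected aggregates when the mean of a column means something different from the per-row value; in 1.0.1 the only mapping is `groundedness_pro_label` → `groundedness_pro_passing_rate`. A minimal sketch of the same aggregation in plain pandas (the column values below are made up for illustration):

```python
import pandas as pd

# Hypothetical per-row results: True means the Groundedness Pro service judged the row grounded.
df = pd.DataFrame({"groundedness_pro.groundedness_pro_label": [True, False, True, None]})

replacements = {"groundedness_pro_label": "groundedness_pro_passing_rate"}

aggregated = {}
for col in df.columns:
    prefix, name = col.split(".", 1)
    if name in replacements:
        numeric = pd.to_numeric(df[col], errors="coerce")  # None -> NaN
        # Mean over non-NaN rows, rounded to 2 places, mirroring list_mean_nan_safe + round.
        aggregated[f"{prefix}.{replacements[name]}"] = round(numeric.mean(skipna=True), 2)

print(aggregated)  # {'groundedness_pro.groundedness_pro_passing_rate': 0.67}
```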
```diff
@@ -85,11 +122,15 @@ def _aggregate_content_safety_metrics(
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-
-
-
-
-
+        try:
+            col_with_boolean_values = apply_transform_nan_safe(
+                col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+            )
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
+
     return content_safety_cols, defect_rates


@@ -119,10 +160,11 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-
-
-
-
+        try:
+            defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+        except EvaluationException:  # only exception that can be cause is all NaN values
+            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+            LOGGER.warning(msg)
     return label_cols, defect_rates

```
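Both defect-rate aggregations now tolerate the all-NaN case by logging a warning instead of failing. The content-safety variant first thresholds the numeric severity score into 0/1 before averaging; a rough plain-pandas equivalent, with 4 used as a stand-in for `CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT` (an assumed value, not taken from this diff):

```python
import pandas as pd

THRESHOLD = 4  # stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT

scores = pd.Series([0, 2, 5, 7, None], name="violence_score")
numeric = pd.to_numeric(scores, errors="coerce")

# Rows at or above the threshold count as defects; NaN rows drop out of the mean.
defects = numeric.dropna().apply(lambda x: 1 if x >= THRESHOLD else 0)

if defects.empty:
    # Mirrors the new behavior: warn and skip the metric instead of raising.
    print("All score evaluations are NaN/None; no aggregation performed.")
else:
    print({"violence_defect_rate": round(defects.mean(), 2)})  # {'violence_defect_rate': 0.5}
```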
```diff
@@ -146,8 +188,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # Rename certain columns as defect rates if we know that's what their aggregates represent
     # Content safety metrics
     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
     handled_columns.extend(content_safety_cols)
+    handled_columns.extend(other_renamed_cols)
     defect_rates.update(cs_defect_rates)
+    defect_rates.update(renamed_cols)
     # Label-based (true/false) metrics where 'true' means 'something is wrong'
     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
     handled_columns.extend(label_cols)
@@ -156,6 +201,9 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)

+    # NOTE: nan/None values don't count as as booleans, so boolean columns with
+    # nan/None values won't have a mean produced from them.
+    # This is different from label-based known evaluators, which have special handling.
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
@@ -163,34 +211,133 @@
     return metrics


-def
+def _validate_columns_for_target(
+    df: pd.DataFrame,
+    target: Callable,
+) -> None:
+    """
+    Check that all columns needed by target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+    """
+    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    # If the target function is given, it may return
+    # several columns and hence we cannot check the availability of columns
+    # without knowing target function semantics.
+    # Instead, here we will validate the columns, taken by target.
     required_inputs = [
         param.name
-        for param in inspect.signature(
+        for param in inspect.signature(target).parameters.values()
         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
     ]

-    missing_inputs = [col for col in required_inputs if col not in
-    if missing_inputs and "conversation" in required_inputs:
-        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
-        if len(missing_inputs) == len(non_conversation_inputs) and [
-            input in non_conversation_inputs for input in missing_inputs
-        ]:
-            missing_inputs = []
+    missing_inputs = [col for col in required_inputs if col not in df.columns]
     if missing_inputs:
-
-        msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
-            blame=ErrorBlame.USER_ERROR,
-        )
-        msg = f"Missing required inputs for target : {missing_inputs}."
+        msg = f"Missing required inputs for target: {missing_inputs}."
         raise EvaluationException(
             message=msg,
-
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_columns_for_evaluators(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Callable],
+    target: Optional[Callable],
+    target_generated_columns: Optional[Set[str]],
+    column_mapping: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluators are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param target_generated_columns: The set of columns generated by the target callable.
+    :type target_generated_columns: Optional[Set[str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+    :type column_mapping: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+    """
+    missing_inputs_per_evaluator = {}
+
+    for evaluator_name, evaluator in evaluators.items():
+        # Apply column mapping
+        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+        new_df = _apply_column_mapping(df, mapping_config)
+
+        # Validate input data for evaluator
+        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+        if is_built_in:
+            # Note that for built-in evaluators supporting the "conversation" parameter,
+            # input parameters are now optional.
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.name not in ["kwargs", "args", "self"]
+            ]
+
+            if "conversation" in evaluator_params and "conversation" in new_df.columns:
+                # Ignore the missing fields if "conversation" presents in the input data
+                missing_inputs = []
+            else:
+                optional_params = (
+                    evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                    if hasattr(evaluator, "_OPTIONAL_PARAMS")
+                    else []
+                )
+                excluded_params = set(new_df.columns).union(optional_params)
+                missing_inputs = [col for col in evaluator_params if col not in excluded_params]
+
+                # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+                # Otherwise, remove it from the missing inputs
+                if "conversation" in missing_inputs:
+                    if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+                        missing_inputs.remove("conversation")
+        else:
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+            ]
+
+            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+        if missing_inputs:
+            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+    if missing_inputs_per_evaluator:
+        msg = "Some evaluators are missing required inputs:\n"
+        for evaluator_name, missing in missing_inputs_per_evaluator.items():
+            msg += f"- {evaluator_name}: {missing}\n"
+
+        # Add the additional notes
+        msg += "\nTo resolve this issue:\n"
+        msg += "- Ensure the data contains required inputs.\n"
+        if target is not None:
+            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+            msg += f"Currently generated columns: {target_generated_columns} \n"
+        msg += "- Check that the column mapping is correctly configured."
+
+        raise EvaluationException(
+            message=msg.strip(),
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
```
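The old single `_validate_columns` helper is replaced by two: target inputs are checked before the target runs, and evaluator inputs are checked afterwards, so one error can list every evaluator with missing inputs together with the columns the target actually produced. Required inputs are detected with plain `inspect.signature` introspection, as in this small sketch (the evaluator below is hypothetical):

```python
import inspect

def required_inputs(fn):
    """Parameters without defaults, excluding self/args/kwargs - the same rule the validators apply."""
    return [
        p.name
        for p in inspect.signature(fn).parameters.values()
        if p.default is inspect.Parameter.empty and p.name not in ("self", "args", "kwargs")
    ]

def my_custom_eval(query, response, context):  # hypothetical custom evaluator
    return {"score": 1.0}

data_columns = {"query", "response"}
missing = [name for name in required_inputs(my_custom_eval) if name not in data_columns]
print(missing)  # ['context'] -> reported per evaluator in a single EvaluationException
```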
```diff
@@ -199,76 +346,85 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta

 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if
-
-
-
-
-
-
-
-
-
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )

-    if
-        if not isinstance(
-            msg = "
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-
-        if not
-            msg = "
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if output_dir and not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

     if azure_ai_project is not None:
-
-        msg = "azure_ai_project parameter must be a dictionary."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.INVALID_VALUE,
-            blame=ErrorBlame.USER_ERROR,
-        )
+        validate_azure_ai_project(azure_ai_project)

     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
```
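`_validate_and_load_data` now reports each failure mode separately (missing `data`, wrong type, nonexistent path, bad `output_path` directory, missing or non-dict `evaluators`), and `data`/`output_path` are typed `Union[str, os.PathLike]`, so `pathlib.Path` values should be accepted. A hedged sketch of what a caller sees for a bad path (the file name is hypothetical; `EvaluationException` is imported from the private `_exceptions` module, exactly as the diff itself does):

```python
from pathlib import Path

from azure.ai.evaluation import evaluate
from azure.ai.evaluation._exceptions import EvaluationException  # private module; location may change

def length_eval(response: str) -> dict:  # hypothetical custom evaluator
    return {"length": len(response)}

try:
    evaluate(
        data=Path("does_not_exist.jsonl"),  # Path objects are accepted as of 1.0.1
        evaluators={"length": length_eval},  # an empty or non-dict value would also be rejected
    )
except EvaluationException as exc:
    # Input problems surface as an EvaluationException with a readable message,
    # e.g. "The input data file path '...' does not exist."
    print(f"Evaluation rejected: {exc}")
```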
```diff
@@ -278,8 +434,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -288,57 +443,13 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     return initial_data_df


-def _validate_columns(
-    df: pd.DataFrame,
-    evaluators: Dict[str, Callable],
-    target: Optional[Callable],
-    column_mapping: Dict[str, Dict[str, str]],
-) -> None:
-    """
-    Check that all columns needed by evaluator or target function are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Callable]
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
-    :type column_mapping: Dict[str, Dict[str, str]]
-    :raises EvaluationException: If column starts from "__outputs." while target is defined.
-    """
-    if target:
-        if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-            msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        # If the target function is given, it may return
-        # several columns and hence we cannot check the availability of columns
-        # without knowing target function semantics.
-        # Instead, here we will validate the columns, taken by target.
-        _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
-    else:
-        for evaluator_name, evaluator in evaluators.items():
-            # Apply column mapping
-            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-            new_df = _apply_column_mapping(df, mapping_config)
-
-            # Validate input data for evaluator
-            _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
-
-
 def _apply_target_to_data(
     target: Callable,
-    data: str,
+    data: Union[str, os.PathLike],
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
-
+    **kwargs,
 ) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.
@@ -346,29 +457,45 @@
     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data: str
+    :type data: Union[str, os.PathLike]
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
     :type evaluation_name: Optional[str]
-    :param _run_name: The name of target run. Used for testing only.
-    :type _run_name: Optional[str]
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
-
-
-
-
-
-
-
-
-
-
-
+    _run_name = kwargs.get("_run_name")
+    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+
+    try:
+        with TargetRunContext(upload_target_snaphot):
+            run: Run = pf_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
+                stream=True,
+                name=_run_name,
+            )
+    except (UserAuthenticationError, UploadInternalError) as ex:
+        if "Failed to upload run" in ex.message:
+            msg = (
+                "Failed to upload the target run to the cloud. "
+                "This may be caused by insufficient permission to access storage or other errors."
+            )
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+            ) from ex
+
+        raise ex
+
     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
```
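`_apply_target_to_data` now wraps the promptflow run in a `TargetRunContext` and converts run-upload failures into an `EvaluationException` pointing at a troubleshooting link. The target contract itself is unchanged: a callable whose required parameters match columns in the input JSONL and whose returned keys become target-output columns that evaluators can consume. A hypothetical target for illustration:

```python
# Hypothetical application target: evaluate() invokes this once per row of the input JSONL.
def ask_my_app(query: str) -> dict:
    """'query' must exist as a column in the data file, or validation fails before the run."""
    answer = f"echo: {query}"  # stand-in for a real model or application call
    # Returned keys (here "response") become generated columns and can be mapped
    # into evaluators, e.g. "${target.response}" in an evaluator's column_mapping.
    return {"response": answer}
```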
```diff
@@ -456,15 +583,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

@@ -489,50 +616,16 @@
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype:
+    :rtype: ~azure.ai.evaluation.EvaluationResult

-
-
-    Evaluate API can be used as follows:
-
-    .. code-block:: python
-
-        from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
-
-
-        model_config = {
-            "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
-            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
-            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
-        }
-
-        coherence_eval = CoherenceEvaluator(model_config=model_config)
-        relevance_eval = RelevanceEvaluator(model_config=model_config)
-
-        path = "evaluate_test_data.jsonl"
-        result = evaluate(
-            data=path,
-            evaluators={
-                "coherence": coherence_eval,
-                "relevance": relevance_eval,
-            },
-            evaluator_config={
-                "coherence": {
-                    "column_mapping": {
-                        "response": "${data.response}",
-                        "query": "${data.query}",
-                    },
-                },
-                "relevance": {
-                    "column_mapping": {
-                        "response": "${data.response}",
-                        "context": "${data.context}",
-                        "query": "${data.query}",
-                    },
-                },
-            },
-        )
+    .. admonition:: Example:

+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START evaluate_method]
+            :end-before: [END evaluate_method]
+            :language: python
+            :dedent: 8
+            :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
     """
     try:
         return _evaluate(
```
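The inline docstring example is swapped for a `literalinclude` of a published sample, and `evaluate` now declares an `EvaluationResult` return type. For reference, here is the removed docstring example in runnable form (same environment-variable names and file name as the old docstring; the data file itself is not part of the package):

```python
import os

from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator

model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

coherence_eval = CoherenceEvaluator(model_config=model_config)
relevance_eval = RelevanceEvaluator(model_config=model_config)

# As of 1.0.1 this returns an EvaluationResult (rows, metrics, studio_url).
result = evaluate(
    data="evaluate_test_data.jsonl",
    evaluators={"coherence": coherence_eval, "relevance": relevance_eval},
    evaluator_config={
        "coherence": {"column_mapping": {"response": "${data.response}", "query": "${data.query}"}},
        "relevance": {
            "column_mapping": {
                "response": "${data.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            }
        },
    },
)
```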
```diff
@@ -563,7 +656,17 @@
                 internal_message=error_message,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.
+                blame=ErrorBlame.USER_ERROR,
+            ) from e
+
+        # Ensure a consistent user experience when encountering errors by converting
+        # all other exceptions to EvaluationException.
+        if not isinstance(e, EvaluationException):
+            raise EvaluationException(
+                message=str(e),
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
             ) from e

         raise e
@@ -578,7 +681,7 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
     if output_dict:
         print("======= Combined Run Summary (Per Evaluator) =======\n")
         print(json.dumps(output_dict, indent=4))
-        print("\n
+        print("\n====================================================\n")


 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
@@ -586,12 +689,12 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-) ->
+) -> EvaluationResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
@@ -604,33 +707,11 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             for evaluator_name, evaluator_configuration in evaluator_config.items()
         }
     )
-    _validate_columns(input_data_df, evaluators, target, column_mapping)

-
-
-        pf_client = PFClient(
-            config=(
-                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
-                if azure_ai_project
-                else None
-            ),
-            user_agent=USER_AGENT,
-        )
-    # pylint: disable=raise-missing-from
-    except MissingAzurePackage:
-        msg = (
-            "The required packages for remote tracking are missing.\n"
-            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
-        )
-
-        raise EvaluationException(
-            message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_PACKAGE,
-            blame=ErrorBlame.USER_ERROR,
-        )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)

-
+    pf_client = PFClient(user_agent=USER_AGENT)
     target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
@@ -639,9 +720,10 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name,
+            target, data, pf_client, input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():
@@ -656,9 +738,8 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             if col not in mapping and run_output not in mapped_to_values:
                 column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup

-
-
-    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+    # After we have generated all columns, we can check if we have everything we need for evaluators.
+    _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -674,7 +755,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     def eval_batch_run(
         batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
     ) -> Dict[str, __EvaluatorInfo]:
-        with
+        with EvalRunContext(batch_run_client):
             runs = {
                 evaluator_name: batch_run_client.run(
                     flow=evaluator,
@@ -688,7 +769,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
                 for evaluator_name, evaluator in evaluators.items()
             }

-            # get_details needs to be called within
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
            return {
                 evaluator_name: {
                     "result": batch_run_client.get_details(run, all_results=True),
@@ -704,11 +785,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
-
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
+        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
     else:
         data = input_data_df
         per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
@@ -750,19 +827,26 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-    studio_url = _log_metrics_and_instance_results(
-        metrics,
-        result_df,
-        trace_destination,
-        target_run,
-        evaluation_name,
-    )

-
+    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+    target_run = None
+    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+    studio_url = None
+    if trace_destination:
+        studio_url = _log_metrics_and_instance_results(
+            metrics,
+            result_df,
+            trace_destination,
+            target_run,
+            evaluation_name,
+        )
+
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+
+    _print_summary(per_evaluator_results)

     if output_path:
         _write_output(output_path, result)

-    _print_summary(per_evaluator_results)
-
     return result
```
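Finally, `_evaluate` now assembles the return value itself as an `EvaluationResult` dict with `rows`, `metrics`, and `studio_url` keys, prints the per-evaluator summary before writing `output_path`, and only logs to the cloud when `azure_ai_project` yields a trace destination. A sketch of consuming that result (continuing from the `evaluate(...)` call above; exact row key names depend on the evaluators used):

```python
# Aggregate metrics, e.g. {"coherence.coherence": 4.2, ...} - names depend on the evaluators.
for name, value in result["metrics"].items():
    print(f"{name}: {value}")

# One dict per input row, combining inputs with each evaluator's outputs.
first_row = result["rows"][0]
print(sorted(first_row.keys()))

# Link to the run in AI Studio, or None when no azure_ai_project was supplied.
print(result["studio_url"])
```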