azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +23 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
- azure/ai/evaluation/_common/constants.py +9 -2
- azure/ai/evaluation/_common/math.py +29 -0
- azure/ai/evaluation/_common/rai_service.py +222 -93
- azure/ai/evaluation/_common/utils.py +328 -19
- azure/ai/evaluation/_constants.py +16 -8
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
- azure/ai/evaluation/_evaluate/_utils.py +77 -25
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
- azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
- azure/ai/evaluation/_exceptions.py +26 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +23 -6
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +222 -169
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
- azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
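
The largest single change above is azure/ai/evaluation/_evaluate/_evaluate.py (+370 -188), whose full diff follows. Its public evaluate() entry point now accepts Union[str, os.PathLike] for data and output_path and is annotated to return EvaluationResult. A minimal calling sketch against the 1.0.0b5 signature; the data file name and the answer_length evaluator are hypothetical stand-ins, not part of the package:

import os
from azure.ai.evaluation import evaluate

def answer_length(*, response: str, **kwargs):
    # Hypothetical custom evaluator: any callable whose keyword parameters match data columns.
    return {"length": len(response)}

result = evaluate(
    data=os.path.abspath("eval_data.jsonl"),      # str or os.PathLike in 1.0.0b5
    evaluators={"answer_length": answer_length},  # must be a non-empty dict (now validated explicitly)
    evaluation_name="length-check",
    output_path="results.json",                   # its parent directory must already exist
)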
azure/ai/evaluation/_evaluate/_evaluate.py

@@ -2,26 +2,31 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
+import json
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union

-import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
+from promptflow._sdk._errors import MissingAzurePackage, UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
+from promptflow.entities import Run

+from azure.ai.evaluation._common.math import list_sum
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
+    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject, EvaluatorConfig
+from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from .
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -29,10 +34,52 @@ from ._utils import (
     _write_output,
 )

+TClient = TypeVar("TClient", ProxyClient, CodeClient)
+
+# For metrics (aggregates) whose metric names intentionally differ from their
+# originating column name, usually because the aggregation of the original value
+# means something sufficiently different.
+# Note that content safety metrics are handled seprately.
+METRIC_COLUMN_NAME_REPLACEMENTS = {
+    "groundedness_pro_label": "groundedness_pro_passing_rate",
+}
+
+
+class __EvaluatorInfo(TypedDict):
+    result: pd.DataFrame
+    metrics: Dict[str, Any]
+    run_summary: Dict[str, Any]
+
+
+def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+    """Identify and average various metrics that need to have the metric name be replaced,
+    instead of having the metric match the originating column name.
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+    and the second element is a dictionary of resultant new metric column names and their values.
+    :rtype: Tuple[List[str], Dict[str, float]]
+    """
+    renamed_cols = []
+    metric_columns = {}
+    for col in df.columns:
+        metric_prefix = col.split(".")[0]
+        metric_name = col.split(".")[1]
+        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+            renamed_cols.append(col)
+            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            metric_columns[new_col_name] = round(
+                list_sum(col_with_numeric_values) / col_with_numeric_values.count(),
+                2,
+            )
+
+    return renamed_cols, metric_columns
+

 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str,
+    df: pd.DataFrame, evaluators: Dict[str, Callable]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.
@@ -73,7 +120,7 @@ def _aggregate_content_safety_metrics(
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-
+            list_sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
             / col_with_numeric_values.count(),
             2,
         )
@@ -107,13 +154,13 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
         defect_rates[defect_rate_name] = round(
-
+            list_sum(col_with_boolean_values) / col_with_boolean_values.count(),
             2,
         )
     return label_cols, defect_rates


-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str,
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
@@ -122,7 +169,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Callable]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
@@ -133,8 +180,11 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     # Rename certain columns as defect rates if we know that's what their aggregates represent
     # Content safety metrics
     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
     handled_columns.extend(content_safety_cols)
+    handled_columns.extend(other_renamed_cols)
     defect_rates.update(cs_defect_rates)
+    defect_rates.update(renamed_cols)
     # Label-based (true/false) metrics where 'true' means 'something is wrong'
     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
     handled_columns.extend(label_cols)
@@ -150,34 +200,127 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
     return metrics


-def
+def _validate_columns_for_target(
+    df: pd.DataFrame,
+    target: Callable,
+) -> None:
+    """
+    Check that all columns needed by target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+    """
+    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    # If the target function is given, it may return
+    # several columns and hence we cannot check the availability of columns
+    # without knowing target function semantics.
+    # Instead, here we will validate the columns, taken by target.
     required_inputs = [
         param.name
-        for param in inspect.signature(
+        for param in inspect.signature(target).parameters.values()
         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
     ]

-    missing_inputs = [col for col in required_inputs if col not in
-    if missing_inputs and "conversation" in required_inputs:
-        non_conversation_inputs = [val for val in required_inputs if val != "conversation"]
-        if len(missing_inputs) == len(non_conversation_inputs) and [
-            input in non_conversation_inputs for input in missing_inputs
-        ]:
-            missing_inputs = []
+    missing_inputs = [col for col in required_inputs if col not in df.columns]
     if missing_inputs:
-
-        msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
-            blame=ErrorBlame.USER_ERROR,
-        )
-        msg = f"Missing required inputs for target : {missing_inputs}."
+        msg = f"Missing required inputs for target: {missing_inputs}."
         raise EvaluationException(
             message=msg,
-
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_FIELD,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+
+def _validate_columns_for_evaluators(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Callable],
+    target: Optional[Callable],
+    target_generated_columns: Optional[Set[str]],
+    column_mapping: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluators are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param target_generated_columns: The set of columns generated by the target callable.
+    :type target_generated_columns: Optional[Set[str]]
+    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+    :type column_mapping: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+    """
+    missing_inputs_per_evaluator = {}
+
+    for evaluator_name, evaluator in evaluators.items():
+        # Apply column mapping
+        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+        new_df = _apply_column_mapping(df, mapping_config)
+
+        # Validate input data for evaluator
+        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+        if is_built_in:
+            # Note that for built-in evaluators supporting the "conversation" parameter,
+            # input parameters are now optional.
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.name not in ["kwargs", "args", "self"]
+            ]
+
+            if "conversation" in evaluator_params and "conversation" in new_df.columns:
+                # Ignore the missing fields if "conversation" presents in the input data
+                missing_inputs = []
+            else:
+                missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+                # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+                # Otherwise, remove it from the missing inputs
+                if "conversation" in missing_inputs:
+                    if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+                        missing_inputs.remove("conversation")
+        else:
+            evaluator_params = [
+                param.name
+                for param in inspect.signature(evaluator).parameters.values()
+                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+            ]
+
+            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+        if missing_inputs:
+            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+    if missing_inputs_per_evaluator:
+        msg = "Some evaluators are missing required inputs:\n"
+        for evaluator_name, missing in missing_inputs_per_evaluator.items():
+            msg += f"- {evaluator_name}: {missing}\n"
+
+        # Add the additional notes
+        msg += "\nTo resolve this issue:\n"
+        msg += "- Ensure the data contains required inputs.\n"
+        if target is not None:
+            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+            msg += f"Currently generated columns: {target_generated_columns} \n"
+        msg += "- Check that the column mapping is correctly configured."
+
+        raise EvaluationException(
+            message=msg.strip(),
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
@@ -186,76 +329,85 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta

 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )

     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-    if
-
-
-
-
-
-
-
-
-
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )

-    if
-        if not isinstance(
-            msg = "
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

-
-        if not
-            msg = "
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )

     if azure_ai_project is not None:
-
-            msg = "azure_ai_project parameter must be a dictionary."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+        validate_azure_ai_project(azure_ai_project)

     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
@@ -265,8 +417,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -275,88 +426,60 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     return initial_data_df


-def _validate_columns(
-    df: pd.DataFrame,
-    evaluators: Dict[str, Any],
-    target: Optional[Callable],
-    column_mapping: Dict[str, Dict[str, str]],
-) -> None:
-    """
-    Check that all columns needed by evaluator or target function are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Any]
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping
-    :type column_mapping: Dict[str, Dict[str, str]]
-    :raises EvaluationException: If column starts from "__outputs." while target is defined.
-    """
-    if target:
-        if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-            msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
-        # If the target function is given, it may return
-        # several columns and hence we cannot check the availability of columns
-        # without knowing target function semantics.
-        # Instead, here we will validate the columns, taken by target.
-        _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
-    else:
-        for evaluator_name, evaluator in evaluators.items():
-            # Apply column mapping
-            mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-            new_df = _apply_column_mapping(df, mapping_config)
-
-            # Validate input data for evaluator
-            _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
-
-
 def _apply_target_to_data(
     target: Callable,
-    data: str,
+    data: Union[str, os.PathLike],
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
-
-) -> Tuple[pd.DataFrame, Set[str]]:
+    **kwargs,
+) -> Tuple[pd.DataFrame, Set[str], Run]:
     """
     Apply the target function to the data set and return updated data and generated columns.

     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data: str
+    :type data: Union[str, os.PathLike]
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
     :type evaluation_name: Optional[str]
-    :param _run_name: The name of target run. Used for testing only.
-    :type _run_name: Optional[str]
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
-
-
-
-
-
-
-
-
-
-
-
-
+    _run_name = kwargs.get("_run_name")
+    upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+
+    try:
+        with TargetRunContext(upload_target_snaphot):
+            run: Run = pf_client.run(
+                flow=target,
+                display_name=evaluation_name,
+                data=data,
+                properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
+                stream=True,
+                name=_run_name,
+            )
+    except (UserAuthenticationError, UploadInternalError) as ex:
+        if "Failed to upload run" in ex.message:
+            msg = (
+                "Failed to upload the target run to the cloud. "
+                "This may be caused by insufficient permission to access storage or other errors."
+            )
+            raise EvaluationException(
+                message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_REMOTE_TRACKING,
+                blame=ErrorBlame.USER_ERROR,
+                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
+            ) from ex
+
+        raise ex
+
+    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -378,16 +501,18 @@ def _apply_target_to_data(
     return target_output, generated_columns, run


-def _process_column_mappings(
+def _process_column_mappings(
+    column_mapping: Dict[str, Optional[Dict[str, str]]],
+) -> Dict[str, Dict[str, str]]:
     """Process column_mapping to replace ${target.} with ${data.}

     :param column_mapping: The configuration for evaluators.
-    :type column_mapping: Dict[str, Dict[str, str]]
+    :type column_mapping: Dict[str, Optional[Dict[str, str]]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """

-    processed_config = {}
+    processed_config: Dict[str, Dict[str, str]] = {}

     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

@@ -441,15 +566,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data: str,
+    data: Union[str, os.PathLike],
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.

@@ -474,7 +599,7 @@ def evaluate(
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype:
+    :rtype: ~azure.ai.evaluation.EvaluationResult

     :Example:

@@ -548,47 +673,92 @@ def evaluate(
                 internal_message=error_message,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.
+                blame=ErrorBlame.USER_ERROR,
+            ) from e
+
+        # Ensure a consistent user experience when encountering errors by converting
+        # all other exceptions to EvaluationException.
+        if not isinstance(e, EvaluationException):
+            raise EvaluationException(
+                message=str(e),
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.FAILED_EXECUTION,
+                blame=ErrorBlame.SYSTEM_ERROR,
             ) from e

         raise e


+def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
+    # Extract evaluators with a non-empty "run_summary"
+    output_dict = {
+        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
+    }
+
+    if output_dict:
+        print("======= Combined Run Summary (Per Evaluator) =======\n")
+        print(json.dumps(output_dict, indent=4))
+        print("\n====================================================")
+
+
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
+    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data:
-    evaluators: Optional[Dict[str, Callable]] = None,
+    data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[str] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
     **kwargs,
-):
+) -> EvaluationResult:
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)

     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
-    column_mapping =
-
-
-
-
-
+    column_mapping = _process_column_mappings(
+        {
+            evaluator_name: evaluator_configuration.get("column_mapping", None)
+            for evaluator_name, evaluator_configuration in evaluator_config.items()
+        }
+    )
+
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)

     # Target Run
-
-
-
-
-
-
+    try:
+        pf_client = PFClient(
+            config=(
+                {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)}
+                if azure_ai_project
+                else None
+            ),
+            user_agent=USER_AGENT,
+        )
+    # pylint: disable=raise-missing-from
+    except MissingAzurePackage:
+        msg = (
+            "The required packages for remote tracking are missing.\n"
+            'To resolve this, please install them by running "pip install azure-ai-evaluation[remote]".'
+        )
+
+        raise EvaluationException(  # pylint: disable=raise-missing-from
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.MISSING_PACKAGE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    trace_destination: Optional[str] = pf_client._config.get_trace_destination()  # pylint: disable=protected-access

-
-
-
+    # Handle the case where the customer manually run "pf config set trace.destination=none"
+    if trace_destination and trace_destination.lower() == "none":
+        trace_destination = None
+
+    target_run: Optional[Run] = None

     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
@@ -596,9 +766,10 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     column_mapping.setdefault("default", {})

     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
+    target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name,
+            target, data, pf_client, input_data_df, evaluation_name, **kwargs
         )

         for evaluator_name, mapping in column_mapping.items():
@@ -613,9 +784,8 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             if col not in mapping and run_output not in mapped_to_values:
                 column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup

-
-
-    _validate_columns(input_data_df, evaluators, target=None, column_mapping=column_mapping)
+    # After we have generated all columns, we can check if we have everything we need for evaluators.
+    _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)

     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -627,45 +797,54 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         # Also ignore columns that are already in config, since they've been covered by target mapping.
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
+
+    def eval_batch_run(
+        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
+    ) -> Dict[str, __EvaluatorInfo]:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    data=data,
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
+            }
+
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            return {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+
     # Batch Run
-    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
-        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
-        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
-        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
-        batch_run_client = ProxyClient(PFClient(user_agent=USER_AGENT))
-
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
+
+        # A user reported intermittent errors when PFClient uploads evaluation runs to the cloud.
+        # The root cause is still unclear, but it seems related to a conflict between the async run uploader
+        # and the async batch run. As a quick mitigation, use a PFClient without a trace destination for batch runs.
+        per_evaluator_results = eval_batch_run(ProxyClient(PFClient(user_agent=USER_AGENT)), data=data)
     else:
-        batch_run_client = CodeClient()
         data = input_data_df
-
-    with BatchRunContext(batch_run_client):
-        for evaluator_name, evaluator in evaluators.items():
-            evaluators_info[evaluator_name] = {}
-            evaluators_info[evaluator_name]["run"] = batch_run_client.run(
-                flow=evaluator,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                data=data,
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-
-        # get_details needs to be called within BatchRunContext scope in order to have user agent populated
-        for evaluator_name, evaluator_info in evaluators_info.items():
-            evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
-            evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
+        per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)

     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name,
-        evaluator_result_df =
+    for evaluator_name, evaluator_result in per_evaluator_results.items():
+        evaluator_result_df = evaluator_result["result"]

         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
@@ -688,7 +867,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             else evaluator_result_df
         )

-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})

     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -706,9 +885,12 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         evaluation_name,
     )

-
+    result_df_dict = result_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore

     if output_path:
         _write_output(output_path, result)

+    _print_summary(per_evaluator_results)
+
     return result
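
As the final hunks show, the run now assembles a plain dictionary typed as EvaluationResult with "rows", "metrics", and "studio_url" keys and prints a combined per-evaluator run summary. A short consumption sketch, assuming the result value returned by the evaluate() call sketched above:

metrics = result["metrics"]        # aggregated values keyed as "<evaluator_name>.<metric>"
rows = result["rows"]              # per-line records converted from the result dataframe
studio_url = result["studio_url"]  # link to the run in AI Studio when results are logged there

for name, value in sorted(metrics.items()):
    print(f"{name}: {value}")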