azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b2__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +5 -31
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +120 -300
- azure/ai/evaluation/_common/utils.py +23 -381
- azure/ai/evaluation/_constants.py +6 -19
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +7 -23
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +17 -33
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/proxy_client.py +4 -32
- azure/ai/evaluation/_evaluate/_eval_run.py +24 -81
- azure/ai/evaluation/_evaluate/_evaluate.py +239 -393
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +17 -17
- azure/ai/evaluation/_evaluate/_utils.py +28 -82
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +18 -17
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +357 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +157 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +88 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +67 -105
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +34 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +301 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +54 -44
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +19 -34
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +89 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +16 -14
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +87 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -20
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +30 -23
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +96 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -26
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +38 -53
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +105 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +132 -203
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +1 -2
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +92 -111
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +12 -13
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +67 -33
- azure/ai/evaluation/simulator/_helpers/__init__.py +2 -1
- azure/ai/evaluation/{_common → simulator/_helpers}/_experimental.py +9 -24
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +5 -26
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +94 -107
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +11 -28
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +4 -8
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_simulator.py +207 -277
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +13 -31
- azure_ai_evaluation-1.0.0b2.dist-info/METADATA +449 -0
- azure_ai_evaluation-1.0.0b2.dist-info/RECORD +99 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b2.dist-info}/top_level.txt +0 -0
--- azure/ai/evaluation/_evaluate/_evaluate.py (1.0.0)
+++ azure/ai/evaluation/_evaluate/_evaluate.py (1.0.0b2)
@@ -2,32 +2,26 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import inspect
-import json
-import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
 
+import numpy as np
 import pandas as pd
 from promptflow._sdk._constants import LINE_NUMBER
-from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
 from promptflow.client import PFClient
-from promptflow.entities import Run
 
-from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
     EvaluationMetrics,
-    EvaluationRunProperties,
     Prefixes,
     _InternalEvaluationMetrics,
 )
-from .._model_configurations import AzureAIProject
+from .._model_configurations import AzureAIProject
 from .._user_agent import USER_AGENT
-from .
+from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -35,54 +29,10 @@ from ._utils import (
     _write_output,
 )
 
-TClient = TypeVar("TClient", ProxyClient, CodeClient)
-LOGGER = logging.getLogger(__name__)
-
-# For metrics (aggregates) whose metric names intentionally differ from their
-# originating column name, usually because the aggregation of the original value
-# means something sufficiently different.
-# Note that content safety metrics are handled seprately.
-METRIC_COLUMN_NAME_REPLACEMENTS = {
-    "groundedness_pro_label": "groundedness_pro_passing_rate",
-}
-
-
-class __EvaluatorInfo(TypedDict):
-    result: pd.DataFrame
-    metrics: Dict[str, Any]
-    run_summary: Dict[str, Any]
-
-
-def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
-    """Identify and average various metrics that need to have the metric name be replaced,
-    instead of having the metric match the originating column name.
-    :param df: The dataframe of evaluation results.
-    :type df: ~pandas.DataFrame
-    :return: A tuple; the first element is a list of dataframe columns that were aggregated,
-    and the second element is a dictionary of resultant new metric column names and their values.
-    :rtype: Tuple[List[str], Dict[str, float]]
-    """
-    renamed_cols = []
-    metric_columns = {}
-    for col in df.columns:
-        metric_prefix = col.split(".")[0]
-        metric_name = col.split(".")[1]
-        if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
-            renamed_cols.append(col)
-            new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
-            try:
-                metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
-            except EvaluationException:  # only exception that can be cause is all NaN values
-                msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
-                LOGGER.warning(msg)
-
-    return renamed_cols, metric_columns
-
 
 # pylint: disable=line-too-long
 def _aggregate_content_safety_metrics(
-    df: pd.DataFrame, evaluators: Dict[str,
+    df: pd.DataFrame, evaluators: Dict[str, Type]
 ) -> Tuple[List[str], Dict[str, float]]:
     """Find and aggregate defect rates for content safety metrics. Returns both a list
     of columns that were used to calculate defect rates and the defect rates themselves.
@@ -122,15 +72,11 @@ def _aggregate_content_safety_metrics(
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
-
-
-
-
-
-        except EvaluationException:  # only exception that can be cause is all NaN values
-            msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
-            LOGGER.warning(msg)
-
+        defect_rates[defect_rate_name] = round(
+            np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
+            / col_with_numeric_values.count(),
+            2,
+        )
     return content_safety_cols, defect_rates
 
 
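The replacement code in the hunk above computes each content-safety defect rate directly with numpy and pandas instead of the helper functions removed from 1.0.0. A minimal standalone sketch of that calculation, using an illustrative threshold value (the package reads the real one from CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT in azure/ai/evaluation/_constants.py) and made-up scores:

    import numpy as np
    import pandas as pd

    # Illustrative threshold; the package uses CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT.
    THRESHOLD = 4

    # Raw severity scores for one content-safety metric; None becomes NaN after
    # pd.to_numeric and is excluded from the denominator by .count().
    scores = pd.to_numeric(pd.Series([0, 2, 5, 7, None, 3]), errors="coerce")

    # Fraction of scored rows at or above the threshold, rounded to two decimals,
    # mirroring the aggregation added in 1.0.0b2.
    defect_rate = round(np.sum(scores >= THRESHOLD) / scores.count(), 2)
    print(defect_rate)  # 0.4 here: 2 of the 5 scored rows are >= THRESHOLD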
@@ -160,15 +106,14 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
-
-
-
-
-            LOGGER.warning(msg)
+        defect_rates[defect_rate_name] = round(
+            np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
+            2,
+        )
     return label_cols, defect_rates
 
 
-def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str,
+def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
     that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
@@ -177,7 +122,7 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     :param df: The dataframe of evaluation results.
     :type df: ~pandas.DataFrame
     :param evaluators: A dictionary mapping of strings to evaluator classes.
-    :type evaluators: Dict[str,
+    :type evaluators: Dict[str, Type]
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
@@ -188,11 +133,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # Rename certain columns as defect rates if we know that's what their aggregates represent
     # Content safety metrics
     content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
-    other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
     handled_columns.extend(content_safety_cols)
-    handled_columns.extend(other_renamed_cols)
     defect_rates.update(cs_defect_rates)
-    defect_rates.update(renamed_cols)
     # Label-based (true/false) metrics where 'true' means 'something is wrong'
     label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
     handled_columns.extend(label_cols)
@@ -201,9 +143,6 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     # For rest of metrics, we will calculate mean
     df.drop(columns=handled_columns, inplace=True)
 
-    # NOTE: nan/None values don't count as as booleans, so boolean columns with
-    # nan/None values won't have a mean produced from them.
-    # This is different from label-based known evaluators, which have special handling.
     mean_value = df.mean(numeric_only=True)
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
@@ -211,133 +150,28 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     return metrics
 
 
-def
-    df: pd.DataFrame,
-    target: Callable,
-) -> None:
-    """
-    Check that all columns needed by target function are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
-    """
-    if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
-        msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
-        raise EvaluationException(
-            message=msg,
-            internal_message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.INVALID_VALUE,
-            blame=ErrorBlame.USER_ERROR,
-        )
-    # If the target function is given, it may return
-    # several columns and hence we cannot check the availability of columns
-    # without knowing target function semantics.
-    # Instead, here we will validate the columns, taken by target.
+def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_target_fn=False):
     required_inputs = [
         param.name
-        for param in inspect.signature(
+        for param in inspect.signature(evaluator).parameters.values()
         if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
     ]
 
-    missing_inputs = [col for col in required_inputs if col not in
+    missing_inputs = [col for col in required_inputs if col not in df_data.columns]
     if missing_inputs:
-
+        if not is_target_fn:
+            msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.MISSING_FIELD,
+                blame=ErrorBlame.USER_ERROR,
+            )
+        msg = f"Missing required inputs for target : {missing_inputs}."
         raise EvaluationException(
             message=msg,
-
-            category=ErrorCategory.MISSING_FIELD,
-            blame=ErrorBlame.USER_ERROR,
-        )
-
-
-def _validate_columns_for_evaluators(
-    df: pd.DataFrame,
-    evaluators: Dict[str, Callable],
-    target: Optional[Callable],
-    target_generated_columns: Optional[Set[str]],
-    column_mapping: Dict[str, Dict[str, str]],
-) -> None:
-    """
-    Check that all columns needed by evaluators are present.
-
-    :param df: The data frame to be validated.
-    :type df: pd.DataFrame
-    :param evaluators: The dictionary of evaluators.
-    :type evaluators: Dict[str, Callable]
-    :param target: The callable to be applied to data set.
-    :type target: Optional[Callable]
-    :param target_generated_columns: The set of columns generated by the target callable.
-    :type target_generated_columns: Optional[Set[str]]
-    :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
-    :type column_mapping: Dict[str, Dict[str, str]]
-    :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
-    """
-    missing_inputs_per_evaluator = {}
-
-    for evaluator_name, evaluator in evaluators.items():
-        # Apply column mapping
-        mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
-        new_df = _apply_column_mapping(df, mapping_config)
-
-        # Validate input data for evaluator
-        is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
-        if is_built_in:
-            # Note that for built-in evaluators supporting the "conversation" parameter,
-            # input parameters are now optional.
-            evaluator_params = [
-                param.name
-                for param in inspect.signature(evaluator).parameters.values()
-                if param.name not in ["kwargs", "args", "self"]
-            ]
-
-            if "conversation" in evaluator_params and "conversation" in new_df.columns:
-                # Ignore the missing fields if "conversation" presents in the input data
-                missing_inputs = []
-            else:
-                optional_params = (
-                    evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
-                    if hasattr(evaluator, "_OPTIONAL_PARAMS")
-                    else []
-                )
-                excluded_params = set(new_df.columns).union(optional_params)
-                missing_inputs = [col for col in evaluator_params if col not in excluded_params]
-
-            # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
-            # Otherwise, remove it from the missing inputs
-            if "conversation" in missing_inputs:
-                if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
-                    missing_inputs.remove("conversation")
-        else:
-            evaluator_params = [
-                param.name
-                for param in inspect.signature(evaluator).parameters.values()
-                if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
-            ]
-
-            missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
-
-        if missing_inputs:
-            missing_inputs_per_evaluator[evaluator_name] = missing_inputs
-
-    if missing_inputs_per_evaluator:
-        msg = "Some evaluators are missing required inputs:\n"
-        for evaluator_name, missing in missing_inputs_per_evaluator.items():
-            msg += f"- {evaluator_name}: {missing}\n"
-
-        # Add the additional notes
-        msg += "\nTo resolve this issue:\n"
-        msg += "- Ensure the data contains required inputs.\n"
-        if target is not None:
-            msg += "- Verify that the target is generating the necessary columns for the evaluators. "
-            msg += f"Currently generated columns: {target_generated_columns} \n"
-        msg += "- Check that the column mapping is correctly configured."
-
-        raise EvaluationException(
-            message=msg.strip(),
+            internal_message=msg,
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
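The new _validate_input_data_for_evaluator above derives an evaluator's required inputs by introspecting its call signature. A small self-contained sketch of that introspection, using a hypothetical evaluator callable rather than anything shipped in the package:

    import inspect

    def sample_evaluator(query, response, context=None, **kwargs):
        """Hypothetical evaluator callable, used only to illustrate the check."""
        return {"score": 1.0}

    # Parameters without defaults (and not self/args/kwargs) are treated as required,
    # the same filter the diff applies via inspect.signature(...).parameters.
    required_inputs = [
        param.name
        for param in inspect.signature(sample_evaluator).parameters.values()
        if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
    ]
    print(required_inputs)  # ['query', 'response']

    # Columns present in the loaded data; anything required but absent is reported.
    data_columns = {"query", "ground_truth"}
    missing_inputs = [col for col in required_inputs if col not in data_columns]
    print(missing_inputs)  # ['response']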
@@ -346,85 +180,76 @@ def _validate_columns_for_evaluators(
 
 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "
-        raise EvaluationException(
-            message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.INVALID_VALUE,
-            blame=ErrorBlame.USER_ERROR,
-        )
-    if not isinstance(data, (os.PathLike, str)):
-        msg = "The 'data' parameter must be a string or a path-like object."
-        raise EvaluationException(
-            message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.INVALID_VALUE,
-            blame=ErrorBlame.USER_ERROR,
-        )
-    if not os.path.exists(data):
-        msg = f"The input data file path '{data}' does not exist."
+        msg = "data parameter must be provided for evaluation."
         raise EvaluationException(
             message=msg,
+            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.
+            category=ErrorCategory.MISSING_FIELD,
             blame=ErrorBlame.USER_ERROR,
         )
 
     if target is not None:
         if not callable(target):
-            msg = "
+            msg = "target parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
+                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
-    if not
-
-
-
-
-
-
-
-
-        raise EvaluationException(
-            message=msg,
-            target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.INVALID_VALUE,
-            blame=ErrorBlame.USER_ERROR,
-        )
+    if data is not None:
+        if not isinstance(data, str):
+            msg = "data parameter must be a string."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
-    if
-    if not isinstance(
-        msg = "
+    if evaluators is not None:
+        if not isinstance(evaluators, dict):
+            msg = "evaluators parameter must be a dictionary."
             raise EvaluationException(
                 message=msg,
+                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
-
-    if
-        msg =
+    if output_path is not None:
+        if not isinstance(output_path, str):
+            msg = "output_path parameter must be a string."
             raise EvaluationException(
                 message=msg,
+                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
     if azure_ai_project is not None:
-
+        if not isinstance(azure_ai_project, Dict):
+            msg = "azure_ai_project parameter must be a dictionary."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
 
     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str)
-        msg = "
+        if not isinstance(evaluation_name, str):
+            msg = "evaluation_name parameter must be a string."
             raise EvaluationException(
                 message=msg,
+                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
@@ -434,7 +259,8 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"
+            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
+            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
@@ -443,60 +269,88 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
     return initial_data_df
 
 
+def _validate_columns(
+    df: pd.DataFrame,
+    evaluators: Dict[str, Any],
+    target: Optional[Callable],
+    evaluator_config: Dict[str, Dict[str, str]],
+) -> None:
+    """
+    Check that all columns needed by evaluator or target function are present.
+
+    :param df: The data frame to be validated.
+    :type df: pd.DataFrame
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Any]
+    :param target: The callable to be applied to data set.
+    :type target: Optional[Callable]
+    :param evaluator_config: The configuration for evaluators.
+    :type evaluator_config: Dict[str, Dict[str, str]]
+    :raises EvaluationException: If column starts from "__outputs." while target is defined.
+    """
+    if target:
+        if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+            msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+        # If the target function is given, it may return
+        # several columns and hence we cannot check the availability of columns
+        # without knowing target function semantics.
+        # Instead, here we will validate the columns, taken by target.
+        _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
+    else:
+        for evaluator_name, evaluator in evaluators.items():
+            # Apply column mapping
+            mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
+            new_df = _apply_column_mapping(df, mapping_config)
+
+            # Validate input data for evaluator
+            _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
+
+
 def _apply_target_to_data(
     target: Callable,
-    data:
+    data: str,
     pf_client: PFClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
-
-) -> Tuple[pd.DataFrame, Set[str]
+    _run_name: Optional[str] = None,
+) -> Tuple[pd.DataFrame, Set[str]]:
     """
     Apply the target function to the data set and return updated data and generated columns.
 
     :param target: The function to be applied to data.
     :type target: Callable
     :param data: The path to input jsonl file.
-    :type data:
+    :type data: str
     :param pf_client: The promptflow client to be used.
     :type pf_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
     :type evaluation_name: Optional[str]
+    :param _run_name: The name of target run. Used for testing only.
+    :type _run_name: Optional[str]
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-    except (UserAuthenticationError, UploadInternalError) as ex:
-        if "Failed to upload run" in ex.message:
-            msg = (
-                "Failed to upload the target run to the cloud. "
-                "This may be caused by insufficient permission to access storage or other errors."
-            )
-            raise EvaluationException(
-                message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                blame=ErrorBlame.USER_ERROR,
-                tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-            ) from ex
-
-        raise ex
-
-    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+    # We are manually creating the temporary directory for the flow
+    # because the way tempdir remove temporary directories will
+    # hang the debugger, because promptflow will keep flow directory.
+    run = pf_client.run(
+        flow=target,
+        display_name=evaluation_name,
+        data=data,
+        properties={"runType": "eval_run", "isEvaluatorRun": "true"},
+        stream=True,
+        name=_run_name,
+    )
+    target_output = pf_client.runs.get_details(run, all_results=True)
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -518,30 +372,28 @@ def _apply_target_to_data(
     return target_output, generated_columns, run
 
 
-def
-
-) -> Dict[str, Dict[str, str]]:
-    """Process column_mapping to replace ${target.} with ${data.}
+def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
+    """Process evaluator_config to replace ${target.} with ${data.}
 
-    :param
-    :type
+    :param evaluator_config: The configuration for evaluators.
+    :type evaluator_config: Dict[str, Dict[str, str]]
     :return: The processed configuration.
     :rtype: Dict[str, Dict[str, str]]
     """
 
-    processed_config
+    processed_config = {}
 
     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
 
-    if
-        for evaluator, mapping_config in
+    if evaluator_config:
+        for evaluator, mapping_config in evaluator_config.items():
             if isinstance(mapping_config, dict):
                 processed_config[evaluator] = {}
 
                 for map_to_key, map_value in mapping_config.items():
                     # Check if there's any unexpected reference other than ${target.} or ${data.}
                     if unexpected_references.search(map_value):
-                        msg = "Unexpected references detected in '
+                        msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
                         raise EvaluationException(
                             message=msg,
                             internal_message=msg,
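The unexpected_references pattern in this hunk rejects any ${...} placeholder whose prefix is neither target. nor data.. A quick standalone check of that behavior; the sample mapping values are illustrative, not taken from the package:

    import re

    # Same pattern as in the diff: a "${...}" reference that does not start with
    # "target." or "data." is flagged as unexpected.
    unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

    samples = ["${data.query}", "${target.response}", "${run.outputs.response}"]
    for value in samples:
        print(value, "->", bool(unexpected_references.search(value)))
    # ${data.query} -> False
    # ${target.response} -> False
    # ${run.outputs.response} -> True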
@@ -583,15 +435,15 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
 # @log_evaluate_activity
 def evaluate(
     *,
-    data:
+    data: str,
     evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    evaluator_config: Optional[Dict[str,
+    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[
+    output_path: Optional[str] = None,
     **kwargs,
-)
+):
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
     data will be run through target function and then results will be evaluated.
 
@@ -606,26 +458,56 @@ def evaluate(
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
     :paramtype target: Optional[Callable]
     :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
-        names as keys and a
-
-
-    :paramtype evaluator_config: Optional[Dict[str,
+        names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
+        keys as the column names in the evaluator input and values as the column names in the input data or data
+        generated by target.
+    :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
     :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
         the results will be saved to a file named `evaluation_results.json` in the folder.
     :paramtype output_path: Optional[str]
     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
     :return: Evaluation results.
-    :rtype:
+    :rtype: dict
+
+    :Example:
+
+    Evaluate API can be used as follows:
+
+    .. code-block:: python
+
+        from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
+
 
-
+        model_config = {
+            "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
+            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
+            "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
+        }
+
+        coherence_eval = CoherenceEvaluator(model_config=model_config)
+        relevance_eval = RelevanceEvaluator(model_config=model_config)
+
+        path = "evaluate_test_data.jsonl"
+        result = evaluate(
+            data=path,
+            evaluators={
+                "coherence": coherence_eval,
+                "relevance": relevance_eval,
+            },
+            evaluator_config={
+                "coherence": {
+                    "response": "${data.response}",
+                    "query": "${data.query}"
+                },
+                "relevance": {
+                    "response": "${data.response}",
+                    "context": "${data.context}",
+                    "query": "${data.query}"
+                }
+            }
+        )
 
-    .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START evaluate_method]
-            :end-before: [END evaluate_method]
-            :language: python
-            :dedent: 8
-            :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
     """
     try:
         return _evaluate(
@@ -656,90 +538,69 @@ def evaluate(
                 internal_message=error_message,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.
-            ) from e
-
-        # Ensure a consistent user experience when encountering errors by converting
-        # all other exceptions to EvaluationException.
-        if not isinstance(e, EvaluationException):
-            raise EvaluationException(
-                message=str(e),
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.SYSTEM_ERROR,
+                blame=ErrorBlame.UNKNOWN,
             ) from e
 
         raise e
 
 
-def
-    # Extract evaluators with a non-empty "run_summary"
-    output_dict = {
-        name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
-    }
-
-    if output_dict:
-        print("======= Combined Run Summary (Per Evaluator) =======\n")
-        print(json.dumps(output_dict, indent=4))
-        print("\n====================================================\n")
-
-
-def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
+def _evaluate(  # pylint: disable=too-many-locals
     *,
-    evaluators: Dict[str, Callable],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
-    data:
-
+    data: Optional[str] = None,
+    evaluators: Optional[Dict[str, Callable]] = None,
+    evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
     azure_ai_project: Optional[AzureAIProject] = None,
-    output_path: Optional[
+    output_path: Optional[str] = None,
     **kwargs,
-)
+):
     input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
 
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
-
-
-
-
-
-
+    evaluator_config = _process_evaluator_config(evaluator_config)
+    _validate_columns(input_data_df, evaluators, target, evaluator_config)
+
+    # Target Run
+    pf_client = PFClient(
+        config=(
+            {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None
+        ),
+        user_agent=USER_AGENT,
     )
 
-
-
-
-    pf_client = PFClient(user_agent=USER_AGENT)
-    target_run: Optional[Run] = None
+    trace_destination = pf_client._config.get_trace_destination()  # pylint: disable=protected-access
+    target_run = None
+    target_generated_columns = set()
 
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
-
-
+    evaluator_config = evaluator_config or {}
+    evaluator_config.setdefault("default", {})
 
     # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
-    target_generated_columns: Set[str] = set()
    if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name,
+            target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
         )
 
-        for evaluator_name, mapping in
+        for evaluator_name, mapping in evaluator_config.items():
             mapped_to_values = set(mapping.values())
             for col in target_generated_columns:
                 # If user defined mapping differently, do not change it.
                 # If it was mapped to target, we have already changed it
-                # in
+                # in _process_evaluator_config
                 run_output = f"${{run.outputs.{col}}}"
                 # We will add our mapping only if
                 # customer did not mapped target output.
                 if col not in mapping and run_output not in mapped_to_values:
-
+                    evaluator_config[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
 
-
-
+    # After we have generated all columns we can check if we have
+    # everything we need for evaluators.
+    _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
 
     # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
     # via target mapping.
@@ -749,52 +610,44 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     for col in input_data_df.columns:
         # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
         # Also ignore columns that are already in config, since they've been covered by target mapping.
-        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in
-
-
-    def eval_batch_run(
-        batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
-    ) -> Dict[str, __EvaluatorInfo]:
-        with EvalRunContext(batch_run_client):
-            runs = {
-                evaluator_name: batch_run_client.run(
-                    flow=evaluator,
-                    run=target_run,
-                    evaluator_name=evaluator_name,
-                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                    data=data,
-                    stream=True,
-                    name=kwargs.get("_run_name"),
-                )
-                for evaluator_name, evaluator in evaluators.items()
-            }
-
-            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-            return {
-                evaluator_name: {
-                    "result": batch_run_client.get_details(run, all_results=True),
-                    "metrics": batch_run_client.get_metrics(run),
-                    "run_summary": batch_run_client.get_run_summary(run),
-                }
-                for evaluator_name, run in runs.items()
-            }
-
+        if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
+            evaluator_config["default"][col] = f"${{data.{col}}}"
     # Batch Run
+    evaluators_info = {}
     use_pf_client = kwargs.get("_use_pf_client", True)
     if use_pf_client:
+        batch_run_client = ProxyClient(pf_client)
+
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         data = os.path.abspath(data)
-        per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
     else:
+        batch_run_client = CodeClient()
         data = input_data_df
-
+
+    with BatchRunContext(batch_run_client):
+        for evaluator_name, evaluator in evaluators.items():
+            evaluators_info[evaluator_name] = {}
+            evaluators_info[evaluator_name]["run"] = batch_run_client.run(
+                flow=evaluator,
+                run=target_run,
+                evaluator_name=evaluator_name,
+                column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
+                data=data,
+                stream=True,
+                name=kwargs.get("_run_name"),
+            )
+
+        # get_details needs to be called within BatchRunContext scope in order to have user agent populated
+        for evaluator_name, evaluator_info in evaluators_info.items():
+            evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
+            evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
 
     # Concatenate all results
     evaluators_result_df = None
     evaluators_metric = {}
-    for evaluator_name,
-    evaluator_result_df =
+    for evaluator_name, evaluator_info in evaluators_info.items():
+        evaluator_result_df = evaluator_info["result"]
 
         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(
@@ -817,7 +670,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
             else evaluator_result_df
         )
 
-        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in
+        evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
 
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
@@ -827,10 +680,6 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
     metrics = _aggregate_metrics(evaluators_result_df, evaluators)
     metrics.update(evaluators_metric)
-
-    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run = None
-    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
     studio_url = _log_metrics_and_instance_results(
         metrics,
         result_df,
@@ -839,10 +688,7 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         evaluation_name,
     )
 
-
-    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
-
-    _print_summary(per_evaluator_results)
+    result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
 
     if output_path:
         _write_output(output_path, result)