azure-ai-evaluation 1.4.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of azure-ai-evaluation has been flagged as potentially problematic.
- azure/ai/evaluation/__init__.py +9 -16
- azure/ai/evaluation/_aoai/__init__.py +10 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
- azure/ai/evaluation/_aoai/label_grader.py +66 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
- azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
- azure/ai/evaluation/_azure/_clients.py +4 -4
- azure/ai/evaluation/_azure/_envs.py +208 -0
- azure/ai/evaluation/_azure/_token_manager.py +12 -7
- azure/ai/evaluation/_common/__init__.py +5 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +118 -0
- azure/ai/evaluation/_common/onedp/__init__.py +32 -0
- azure/ai/evaluation/_common/onedp/_client.py +139 -0
- azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
- azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_types.py +21 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -0
- azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
- azure/ai/evaluation/_common/onedp/_version.py +9 -0
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
- azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
- azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
- azure/ai/evaluation/_common/onedp/py.typed +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
- azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
- azure/ai/evaluation/_common/rai_service.py +159 -29
- azure/ai/evaluation/_common/raiclient/_version.py +1 -1
- azure/ai/evaluation/_common/utils.py +80 -2
- azure/ai/evaluation/_constants.py +16 -0
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +4 -4
- azure/ai/evaluation/_eval_mapping.py +71 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +17 -4
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +372 -105
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +534 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
- azure/ai/evaluation/_evaluate/_utils.py +120 -7
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +9 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +467 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +2 -2
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +6 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +1 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +8 -2
- azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +1 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +5 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +6 -2
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
- azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
- azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
- azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
- azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
- azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
- azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
- azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
- azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
- azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
- azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +7 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
- azure/ai/evaluation/_legacy/_batch_engine/_status.py +1 -1
- azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
- azure/ai/evaluation/{_red_team/_utils → _legacy/_common}/__init__.py +1 -1
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
- azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
- azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
- azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +90 -17
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +19 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_objective_generator.py +3 -0
- azure/ai/evaluation/{_red_team → red_team}/_attack_strategy.py +4 -1
- azure/ai/evaluation/{_red_team → red_team}/_red_team.py +885 -481
- azure/ai/evaluation/red_team/_red_team_result.py +382 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/constants.py +2 -1
- azure/ai/evaluation/{_red_team → red_team}/_utils/formatting_utils.py +23 -22
- azure/ai/evaluation/{_red_team → red_team}/_utils/logging_utils.py +1 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
- azure/ai/evaluation/{_red_team → red_team}/_utils/strategy_utils.py +9 -5
- azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
- azure/ai/evaluation/simulator/_constants.py +1 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
- azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +35 -22
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +40 -25
- azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +24 -18
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +9 -5
- azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
- azure/ai/evaluation/simulator/_simulator.py +1 -1
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/METADATA +36 -2
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/RECORD +148 -80
- azure/ai/evaluation/_red_team/_red_team_result.py +0 -246
- azure/ai/evaluation/simulator/_tracing.py +0 -89
- /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_callback_chat_target.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team}/_default_converter.py +0 -0
- /azure/ai/evaluation/{_red_team → red_team/_utils}/__init__.py +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.4.0.dist-info → azure_ai_evaluation-1.6.0.dist-info}/top_level.txt +0 -0
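The most visible API change in this release, detailed in the _evaluate.py diff below, is that evaluate() now accepts AzureOpenAIGrader instances alongside ordinary callable evaluators (evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]), and azure_ai_project may be either an AzureAIProject dict or a project endpoint string. A minimal usage sketch follows; the data file and the answer_length evaluator are illustrative assumptions, and the commented-out grader entry stands in for an AzureOpenAIGrader instance whose construction is not shown in this diff.

# Illustrative sketch of the 1.6.0 evaluate() call surface; parameter names come from
# the new signature in _evaluate.py. The data file and the custom evaluator are assumptions.
from azure.ai.evaluation import evaluate

def answer_length(response: str, **kwargs) -> dict:
    # Trivial custom callable evaluator: assumes the data has a "response" column.
    return {"length": len(response)}

result = evaluate(
    data="eval_data.jsonl",              # JSONL/CSV rows to evaluate (hypothetical file)
    evaluators={
        "answer_length": answer_length,
        # "label_check": some_aoai_grader,  # an AzureOpenAIGrader instance; graders are
        #                                   # split out and run as a remote AOAI eval run
    },
    # azure_ai_project may now also be a project endpoint string
    # (Optional[Union[str, AzureAIProject]]); omit it for purely local runs.
    output_path="results.json",
)
print(result["metrics"])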
--- a/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -6,43 +6,59 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict,
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
 
+from openai import OpenAI, AzureOpenAI
+from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+from azure.ai.evaluation._legacy._adapters.entities import Run
 import pandas as pd
-from promptflow._sdk._constants import LINE_NUMBER
-from promptflow.client import PFClient
-from promptflow.entities import Run
-from promptflow._sdk._configuration import Configuration
 
 from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
+from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
+
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+    EVALUATION_PASS_FAIL_MAPPING,
     EvaluationMetrics,
     DefaultOpenEncoding,
     Prefixes,
     _InternalEvaluationMetrics,
+    BINARY_AGGREGATE_SUFFIX,
+    DEFAULT_OAI_EVAL_RUN_NAME
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import
+from ._batch_run import (
+    EvalRunContext,
+    CodeClient,
+    ProxyClient,
+    TargetRunContext,
+    RunSubmitterClient,
+)
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
     _trace_destination_from_project_scope,
     _write_output,
-    DataLoaderFactory,
+    DataLoaderFactory, _log_metrics_and_instance_results_onedp,
 )
+from ._batch_run.batch_clients import BatchClient, BatchClientRun
 
-
+from ._evaluate_aoai import (
+    _begin_aoai_evaluation,
+    _split_evaluators_and_grader_configs,
+    _get_evaluation_run_results,
+    OAIEvalRunCreationInfo
+)
 LOGGER = logging.getLogger(__name__)
 
 # For metrics (aggregates) whose metric names intentionally differ from their
 # originating column name, usually because the aggregation of the original value
 # means something sufficiently different.
-# Note that content safety metrics are handled
+# Note that content safety metrics are handled separately.
 METRIC_COLUMN_NAME_REPLACEMENTS = {
     "groundedness_pro_label": "groundedness_pro_passing_rate",
 }
@@ -53,6 +69,19 @@ class __EvaluatorInfo(TypedDict):
     metrics: Dict[str, Any]
     run_summary: Dict[str, Any]
 
+class __ValidatedData(TypedDict):
+    '''
+    Simple dictionary that contains ALL pre-processed data and
+    the resultant objects that are needed for downstream evaluation.
+    '''
+    evaluators: Dict[str, Callable]
+    graders: Dict[str, AzureOpenAIGrader]
+    input_data_df: pd.DataFrame
+    column_mapping: Dict[str, Dict[str, str]]
+    target_run: Optional[BatchClientRun]
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+
 
 def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
     """Identify and average various metrics that need to have the metric name be replaced,
@@ -71,7 +100,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
         if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
             renamed_cols.append(col)
             new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-            col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+            col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
             try:
                 metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
             except EvaluationException:  # only exception that can be cause is all NaN values
@@ -122,7 +151,7 @@ def _aggregate_content_safety_metrics(
     defect_rates = {}
     for col in content_safety_df.columns:
         defect_rate_name = col.replace("_score", "_defect_rate")
-        col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+        col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
         try:
             col_with_boolean_values = apply_transform_nan_safe(
                 col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -161,37 +190,40 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
         metric_name = col.split(".")[1]
         if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
             label_cols.append(col)
-        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+        if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
             details_cols = col
 
     label_df = df[label_cols]
     defect_rates = {}
     for col in label_df.columns:
         defect_rate_name = col.replace("_label", "_defect_rate")
-        col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+        col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
         try:
             defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
         except EvaluationException:  # only exception that can be cause is all NaN values
             msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
             LOGGER.warning(msg)
-
+
     if details_cols:
         details_df = df[details_cols]
         detail_defect_rates = {}
-
+
         for key, value in details_df.items():
            _process_rows(value, detail_defect_rates)
-
+
        for key, value in detail_defect_rates.items():
            col_with_boolean_values = pd.to_numeric(value, errors="coerce")
            try:
-                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                    list_mean_nan_safe(col_with_boolean_values), 2
+                )
            except EvaluationException:  # only exception that can be cause is all NaN values
                msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
                LOGGER.warning(msg)
-
+
     return label_cols, defect_rates
 
+
 def _process_rows(row, detail_defect_rates):
     for key, value in row.items():
         if key not in detail_defect_rates:
@@ -199,6 +231,49 @@ def _process_rows(row, detail_defect_rates):
             detail_defect_rates[key].append(value)
     return detail_defect_rates
 
+
+def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+    """
+    Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+    For each evaluator, calculates the proportion of "pass" results.
+
+    :param df: The dataframe of evaluation results.
+    :type df: ~pandas.DataFrame
+    :return: A dictionary mapping evaluator names to the proportion of pass results.
+    :rtype: Dict[str, float]
+    """
+    results = {}
+
+    # Find all columns that end with "_result"
+    result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+    for col in result_columns:
+        # Extract the evaluator name from the column name
+        # (outputs.<evaluator>.<metric>_result)
+        parts = col.split(".")
+        evaluator_name = None
+        if len(parts) >= 3:
+            evaluator_name = parts[1]
+        else:
+            LOGGER.warning("Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col)
+            continue
+        if evaluator_name:
+            # Count the occurrences of each unique value (pass/fail)
+            value_counts = df[col].value_counts().to_dict()
+
+            # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+            total_rows = len(df)
+            pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+            proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+            # Set the result with the evaluator name as the key
+            result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+            results[result_key] = round(proportion, 2)
+
+    return results
+
+
 def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
     """Aggregate metrics from the evaluation results.
     On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -212,6 +287,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     :return: The aggregated metrics.
     :rtype: Dict[str, float]
     """
+    binary_metrics = _aggregation_binary_output(df)
+
     df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)
 
     handled_columns = []
@@ -239,6 +316,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
     metrics = mean_value.to_dict()
     # Add defect rates back into metrics
     metrics.update(defect_rates)
+
+    # Add binary threshold metrics based on pass/fail results
+    metrics.update(binary_metrics)
+
     return metrics
 
 
@@ -330,7 +411,7 @@ def _validate_columns_for_evaluators(
             missing_inputs = []
         else:
             optional_params = (
-                evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
                 if hasattr(evaluator, "_OPTIONAL_PARAMS")
                 else []
             )
@@ -477,12 +558,12 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 
 def _apply_target_to_data(
     target: Callable,
-    data: Union[str, os.PathLike],
-    batch_client:
+    data: Union[str, os.PathLike, pd.DataFrame],
+    batch_client: BatchClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
-) -> Tuple[pd.DataFrame, Set[str],
+) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
     """
     Apply the target function to the data set and return updated data and generated columns.
 
@@ -499,18 +580,19 @@ def _apply_target_to_data(
     :return: The tuple, containing data frame and the list of added columns.
     :rtype: Tuple[pandas.DataFrame, List[str]]
     """
+
     _run_name = kwargs.get("_run_name")
-    with TargetRunContext():
-        run:
+    with TargetRunContext(batch_client):
+        run: BatchClientRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
             stream=True,
             name=_run_name,
+            evaluator_name=getattr(target, "__qualname__", "TARGET"),
         )
-
-
-    run_summary = batch_client.get_run_summary(run)
+        target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+        run_summary = batch_client.get_run_summary(run)
 
     if run_summary["completed_lines"] == 0:
         msg = (
@@ -541,7 +623,7 @@
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)
 
-    return target_output, generated_columns, run
+    return target_output, generated_columns, run
 
 
 def _process_column_mappings(
@@ -557,7 +639,7 @@ def _process_column_mappings(
 
     processed_config: Dict[str, Dict[str, str]] = {}
 
-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-
+    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")
 
     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
@@ -606,15 +688,14 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
-# @log_evaluate_activity
 def evaluate(
     *,
     data: Union[str, os.PathLike],
-    evaluators: Dict[str, Callable],
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
     **kwargs,
@@ -626,8 +707,9 @@ def evaluate(
         JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
     :paramtype data: str
     :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-        and value as the evaluator function.
-
+        and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+        Required.
+    :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
     :keyword evaluation_name: Display name of the evaluation.
     :paramtype evaluation_name: Optional[str]
     :keyword target: Target to be evaluated. `target` and `data` both cannot be None
@@ -664,7 +746,7 @@
         evaluation_name=evaluation_name,
         target=target,
         data=data,
-
+        evaluators_and_graders=evaluators,
         evaluator_config=evaluator_config,
         azure_ai_project=azure_ai_project,
         output_path=output_path,
@@ -729,23 +811,157 @@ def _print_fail_flag_warning() -> None:
 
 def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
     *,
-
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
     evaluation_name: Optional[str] = None,
     target: Optional[Callable] = None,
     data: Union[str, os.PathLike],
     evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-    azure_ai_project: Optional[AzureAIProject] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
     **kwargs,
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
         _print_fail_flag_warning()
-
+
+    # Turn inputted mess of data into a dataframe, apply targets if needed
+    # split graders and evaluators, and verify that column mappings are sensible.
+    validated_data = _preprocess_data(
+        data=data,
+        evaluators_and_graders=evaluators_and_graders,
+        evaluator_config=evaluator_config,
+        target=target,
+        output_path=output_path,
+        azure_ai_project=azure_ai_project,
+        evaluation_name=evaluation_name,
+        **kwargs,
+    )
+
+    # extract relevant info from validated data
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
+    graders = validated_data["graders"]
+    input_data_df = validated_data["input_data_df"]
+    results_df = pd.DataFrame()
+    metrics: Dict[str, float] = {}
+    eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+
+    # Start OAI eval runs if any graders are present.
+    need_oai_run = len(graders) > 0
+    need_local_run = len(evaluators) > 0
+    need_get_oai_results = False
+    got_local_results = False
+    if need_oai_run:
+        try:
+            aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
+            eval_run_info_list = _begin_aoai_evaluation(
+                graders,
+                column_mapping,
+                input_data_df,
+                aoi_name
+            )
+            need_get_oai_results = len(eval_run_info_list) > 0
+        except EvaluationException as e:
+            if need_local_run:
+                # If there are normal evaluators, don't stop execution and try to run
+                # those.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed during run creation." +
+                    " Continuing with local evaluators.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
+    if need_local_run:
+        try:
+            eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
+                validated_data=validated_data,
+                fail_on_evaluator_errors=fail_on_evaluator_errors
+            )
+            results_df = eval_result_df
+            metrics = eval_metrics
+            got_local_results = True
+            # TODO figure out how to update this printing to include OAI results?
+            _print_summary(per_evaluator_results)
+        except EvaluationException as e:
+            if need_get_oai_results:
+                # If there are OAI graders, we only print a warning on local failures.
+                LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Retrieve OAI eval run results if needed.
+    if need_get_oai_results:
+        try:
+            aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list)  # type: ignore
+            # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
+
+            # Combine results if both evaluators and graders are present
+            if len(evaluators) > 0:
+                results_df = pd.concat([results_df, aoai_results], axis=1)
+                metrics.update(aoai_metrics)
+            else:
+                # Otherwise combine aoai results with input data df to include input columns in outputs.
+                results_df = pd.concat([input_data_df, aoai_results], axis=1)
+                metrics = aoai_metrics
+        except EvaluationException as e:
+            if got_local_results:
+                # If there are local eval results, we only print a warning on OAI failure.
+                LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
+                LOGGER.warning(e)
+            else:
+                raise e
+
+    # Done with all evaluations, message outputs into final forms, and log results if needed.
+    name_map = _map_names_to_builtins(evaluators, graders)
+    if is_onedp_project(azure_ai_project):
+        studio_url = _log_metrics_and_instance_results_onedp(
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+        )
+    else:
+        # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
+        trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
+        studio_url = None
+        if trace_destination:
+            studio_url = _log_metrics_and_instance_results(
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+            )
+
+    result_df_dict = results_df.to_dict("records")
+    result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url}  # type: ignore
+
+    if output_path:
+        _write_output(output_path, result)
+
+    return result
 
+
+def _preprocess_data(
+    data: Union[str, os.PathLike],
+    evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
+    evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+    target: Optional[Callable] = None,
+    output_path: Optional[Union[str, os.PathLike]] = None,
+    azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
+    evaluation_name: Optional[str] = None,
+    **kwargs,
+) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
     if evaluator_config is None:
         evaluator_config = {}
+
+    input_data_df = _validate_and_load_data(
+        target,
+        data,
+        evaluators_and_graders,
+        output_path,
+        azure_ai_project,
+        evaluation_name
+    )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
     # extract column mapping dicts into dictionary mapping evaluator name to column mapping
     column_mapping = _process_column_mappings(
         {
@@ -754,23 +970,46 @@ def _evaluate(  # pylint: disable=too-many-locals,too-many-statements
         }
     )
 
-    if target is not None:
-        _validate_columns_for_target(input_data_df, target)
-
-    Configuration.get_instance().set_config("trace.destination", "none")
-    pf_client = PFClient(user_agent=USER_AGENT)
-    target_run: Optional[Run] = None
-
     # Create default configuration for evaluators that directly maps
     # input data names to keyword inputs of the same name in the evaluators.
     column_mapping = column_mapping or {}
     column_mapping.setdefault("default", {})
 
-    #
+    # Split normal evaluators and OAI graders
+    evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
+
+    input_data_df = _validate_and_load_data(
+        target,
+        data,
+        evaluators_and_graders,
+        output_path,
+        azure_ai_project,
+        evaluation_name
+    )
+    if target is not None:
+        _validate_columns_for_target(input_data_df, target)
+
+    target_run: Optional[BatchClientRun] = None
     target_generated_columns: Set[str] = set()
+    batch_run_client: BatchClient
+    batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
+
+    if kwargs.pop("_use_run_submitter_client", False):
+        batch_run_client = RunSubmitterClient()
+        batch_run_data = input_data_df
+    elif kwargs.pop("_use_pf_client", True):
+        batch_run_client = ProxyClient(user_agent=USER_AGENT)
+        # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+        # multiple evaluators. If the path is already absolute, abspath will return the original path.
+        batch_run_data = os.path.abspath(data)
+    else:
+        batch_run_client = CodeClient()
+        batch_run_data = input_data_df
+
+    # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target,
+            target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )
 
     for evaluator_name, mapping in column_mapping.items():
@@ -799,46 +1038,55 @@
         if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
             column_mapping["default"][col] = f"${{data.{col}}}"
 
-
-
-
-
-
-
-
-
-
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                data=data,
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
+    return __ValidatedData(
+        evaluators=evaluators,
+        graders=graders,
+        input_data_df=input_data_df,
+        column_mapping=column_mapping,
+        target_run=target_run,
+        batch_run_client=batch_run_client,
+        batch_run_data=batch_run_data,
+    )
 
-    # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-    return {
-        evaluator_name: {
-            "result": batch_run_client.get_details(run, all_results=True),
-            "metrics": batch_run_client.get_metrics(run),
-            "run_summary": batch_run_client.get_run_summary(run),
-        }
-        for evaluator_name, run in runs.items()
-    }
 
-
-
-
-
-
-
-
-
-
+def _run_callable_evaluators(
+    validated_data: __ValidatedData,
+    fail_on_evaluator_errors: bool = False,
+    **kwargs,
+) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
+
+    # Extract needed values
+    batch_run_client = validated_data["batch_run_client"]
+    target_run = validated_data["target_run"]
+    batch_run_data = validated_data["batch_run_data"]
+    column_mapping = validated_data["column_mapping"]
+    evaluators = validated_data["evaluators"]
+    with EvalRunContext(batch_run_client):
+        runs = {
+            evaluator_name: batch_run_client.run(
+                flow=evaluator,
+                data=batch_run_data,
+                run=target_run,
+                evaluator_name=evaluator_name,
+                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                stream=True,
+                name=kwargs.get("_run_name"),
+            )
+            for evaluator_name, evaluator in evaluators.items()
+        }
+
+        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+            evaluator_name: {
+                "result": batch_run_client.get_details(run, all_results=True),
+                "metrics": batch_run_client.get_metrics(run),
+                "run_summary": batch_run_client.get_run_summary(run),
+            }
+            for evaluator_name, run in runs.items()
+        }
 
     # Concatenate all results
-    evaluators_result_df =
+    evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
     for evaluator_name, evaluator_result in per_evaluator_results.items():
         if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
@@ -873,31 +1121,50 @@
     # Rename columns, generated by target function to outputs instead of inputs.
     # If target generates columns, already present in the input data, these columns
     # will be marked as outputs already so we do not need to rename them.
-    input_data_df = _rename_columns_conditionally(input_data_df)
-
-    result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
-    metrics = _aggregate_metrics(evaluators_result_df, evaluators)
-    metrics.update(evaluators_metric)
-
-    # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
-    target_run = None
-    trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
-    studio_url = None
-    if trace_destination:
-        studio_url = _log_metrics_and_instance_results(
-            metrics, result_df, trace_destination, target_run, evaluation_name, **kwargs
-        )
 
-
-
+    input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
+    eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
+    eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
+    eval_metrics.update(evaluators_metric)
 
-
+    return eval_result_df, eval_metrics, per_evaluator_results
 
-
-
+def _map_names_to_builtins(
+    evaluators: Dict[str, Callable],
+    graders: Dict[str, AzureOpenAIGrader],
+) -> Dict[str, str]:
+    """
+    Construct a mapping from user-supplied evaluator names to which known, built-in
+    evaluator or grader they refer to. Custom or otherwise unknown evaluators are
+    mapped to the "unknown" value.
 
-
+    :param evaluators: The dictionary of evaluators.
+    :type evaluators: Dict[str, Callable]
+    :param graders: The dictionary of graders.
+    :type graders: Dict[str, AzureOpenAIGrader]
+    :param evaluator_config: The configuration for evaluators.
+    :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+
+    """
+    from .._eval_mapping import EVAL_CLASS_MAP
+    name_map = {}
+
+    for name, evaluator in evaluators.items():
+        # Check if the evaluator is a known built-in evaluator
+        found_eval = False
+        for eval_class, eval_id in EVAL_CLASS_MAP.items():
+            if isinstance(evaluator, eval_class):
+                name_map[name] = eval_id
+                found_eval = True
+                break
+        if not found_eval:
+            # If not found, map to "unknown"
+            name_map[name] = "unknown"
+
+    for name, grader in graders.items():
+        name_map[name] = grader.id
 
+    return name_map
 
 def _turn_error_logs_into_exception(log_path: str) -> None:
     """Produce an EvaluationException using the contents of the inputted
@@ -913,4 +1180,4 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
         target=ErrorTarget.EVALUATE,
         category=ErrorCategory.FAILED_EXECUTION,
         blame=ErrorBlame.UNKNOWN,
-    )
+    )