azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff covers publicly available package versions that have been released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import json
 import logging
 import re

@@ -60,6 +61,7 @@ def _split_evaluators_and_grader_configs(
     :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
     :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
     """
+    LOGGER.info(f"AOAI: Splitting {len(evaluators)} evaluators into AOAI graders and standard evaluators...")
     true_evaluators = {}
     aoai_graders = {}
     for key, value in evaluators.items():
@@ -67,6 +69,7 @@ def _split_evaluators_and_grader_configs(
             aoai_graders[key] = value
         else:
             true_evaluators[key] = value
+    LOGGER.info(f"AOAI: Found {len(aoai_graders)} AOAI graders and {len(true_evaluators)} standard evaluators.")
     return true_evaluators, aoai_graders


@@ -103,11 +106,18 @@ def _begin_aoai_evaluation(
     LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
     all_eval_run_info: List[OAIEvalRunCreationInfo] = []

-
+    grader_mapping_list = list(_get_graders_and_column_mappings(graders, column_mappings))
+    LOGGER.info(f"AOAI: Will create {len(grader_mapping_list)} separate evaluation run(s) based on column mappings.")
+
+    for idx, (selected_graders, selected_column_mapping) in enumerate(grader_mapping_list):
+        LOGGER.info(
+            f"AOAI: Starting evaluation run {idx + 1}/{len(grader_mapping_list)} with {len(selected_graders)} grader(s)..."
+        )
         all_eval_run_info.append(
             _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
         )

+    LOGGER.info(f"AOAI: Successfully created {len(all_eval_run_info)} evaluation run(s).")
     return all_eval_run_info


@@ -133,6 +143,7 @@ def _begin_single_aoai_evaluation(
     """

     # Format data for eval group creation
+    LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}")
     grader_name_list = []
     grader_list = []
     # It's expected that all graders supplied for a single eval run use the same credentials
@@ -143,10 +154,12 @@ def _begin_single_aoai_evaluation(
         grader_name_list.append(name)
         grader_list.append(grader._grader_config)
     effective_column_mapping: Dict[str, str] = column_mapping or {}
+    LOGGER.info(f"AOAI: Generating data source config with {len(effective_column_mapping)} column mapping(s)...")
     data_source_config = _generate_data_source_config(data, effective_column_mapping)
+    LOGGER.info(f"AOAI: Data source config generated with schema type: {data_source_config.get('type')}")

     # Create eval group
-
+    LOGGER.info(f"AOAI: Creating eval group with {len(grader_list)} testing criteria...")
     eval_group_info = client.evals.create(
         data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
     )
@@ -167,6 +180,7 @@ def _begin_single_aoai_evaluation(
         grader_name_map[criteria.id] = name

     # Create eval run
+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' with {len(data)} data rows...")
    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping)
    LOGGER.info(
        f"AOAI: Eval run created with id {eval_run_id}."
@@ -197,13 +211,16 @@ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> T
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """

+    LOGGER.info(f"AOAI: Retrieving results from {len(all_run_info)} evaluation run(s)...")
     run_metrics = {}
     output_df = pd.DataFrame()
-    for run_info in all_run_info:
+    for idx, run_info in enumerate(all_run_info):
+        LOGGER.info(f"AOAI: Fetching results for run {idx + 1}/{len(all_run_info)} (ID: {run_info['eval_run_id']})...")
         cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
         output_df = pd.concat([output_df, cur_output_df], axis=1)
         run_metrics.update(cur_run_metrics)

+    LOGGER.info(f"AOAI: Successfully retrieved all results. Combined dataframe shape: {output_df.shape}")
     return output_df, run_metrics


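Note: the hunk above merges each run's dataframe column-wise and folds the per-run metric dictionaries together. A self-contained sketch of that merge pattern with toy data (the column names are invented for illustration):

    import pandas as pd

    # Toy per-run outputs standing in for (cur_output_df, cur_run_metrics) pairs.
    run_outputs = [
        (pd.DataFrame({"grader_a.passed": [True, False]}), {"grader_a.pass_rate": 0.5}),
        (pd.DataFrame({"grader_b.passed": [True, True]}), {"grader_b.pass_rate": 1.0}),
    ]

    combined_df = pd.DataFrame()
    combined_metrics = {}
    for cur_df, cur_metrics in run_outputs:
        combined_df = pd.concat([combined_df, cur_df], axis=1)  # column-wise merge
        combined_metrics.update(cur_metrics)

    print(combined_df.shape)   # (2, 2)
    print(combined_metrics)    # {'grader_a.pass_rate': 0.5, 'grader_b.pass_rate': 1.0}

Because each per-run dataframe is reset to a RangeIndex later in this file, the column-wise concat aligns rows by position.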
@@ -223,8 +240,10 @@ def _get_single_run_results(
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """
     # Wait for evaluation run to complete
+    LOGGER.info(f"AOAI: Waiting for eval run {run_info['eval_run_id']} to complete...")
     run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])

+    LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -235,6 +254,7 @@ def _get_single_run_results(
         )

     # Convert run results into a dictionary of metrics
+    LOGGER.info(f"AOAI: Processing results and calculating metrics for run {run_info['eval_run_id']}...")
     run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
@@ -255,8 +275,10 @@ def _get_single_run_results(
         ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
+        LOGGER.info(f"AOAI: Grader '{grader_name}': {passed} passed, {failed} failed, pass_rate={ratio:.4f}")

     # Collect all results with pagination
+    LOGGER.info(f"AOAI: Collecting output items for run {run_info['eval_run_id']} with pagination...")
     all_results: List[Any] = []
     next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
@@ -280,6 +302,7 @@ def _get_single_run_results(
         else:
             break

+    LOGGER.info(f"AOAI: Collected {len(all_results)} total output items across all pages.")
     listed_results: Dict[str, List[Any]] = {"index": []}
     # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
@@ -329,6 +352,7 @@ def _get_single_run_results(

     # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
+    LOGGER.info(f"AOAI: Processing {num_rows} result rows into dataframe...")
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
@@ -356,6 +380,7 @@ def _get_single_run_results(
     expected = run_info.get("expected_rows", None)
     if expected is not None:
         pre_len = len(output_df)
+        LOGGER.info(f"AOAI: Validating result count: expected {expected} rows, received {pre_len} rows.")
         # Assumes original datasource_item_id space is 0..expected-1
         output_df = output_df.reindex(range(expected))
         if pre_len != expected:
@@ -388,6 +413,9 @@ def _get_single_run_results(

     # Reset to RangeIndex so downstream concatenation aligns on position
     output_df.reset_index(drop=True, inplace=True)
+    LOGGER.info(
+        f"AOAI: Successfully processed run {run_info['eval_run_id']} with final dataframe shape: {output_df.shape}"
+    )
     return output_df, run_metrics


@@ -481,11 +509,16 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """

+    LOGGER.info(f"AOAI: Organizing {len(graders)} graders with column mappings...")
     if column_mappings is None:
+        LOGGER.info("AOAI: No column mappings provided, each grader will have its own eval run.")
        return [({name: grader}, None) for name, grader in graders.items()]
    default_mapping = column_mappings.get("default", None)
    if default_mapping is None:
        default_mapping = {}
+    LOGGER.info(
+        f"AOAI: Using default mapping with {len(default_mapping)} entries for graders without specific mappings."
+    )
    return [
        ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
        for name, grader in graders.items()
@@ -593,17 +626,23 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
     helper function.
     """
     # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
+    LOGGER.info(
+        f"AOAI: Generating data source config for {len(input_data_df)} rows with {len(column_mapping)} column mapping(s)..."
+    )
     referenced_paths: List[str] = []
     for v in column_mapping.values():
         m = DATA_PATH_PATTERN.match(v)
         if m:
             referenced_paths.append(m.group(1))

+    LOGGER.info(f"AOAI: Found {len(referenced_paths)} referenced paths in column mappings: {referenced_paths}")
     # Decide if we have nested structures
     has_nested = any("." in p for p in referenced_paths)
+    LOGGER.info(f"AOAI: Schema generation mode: {'nested' if has_nested else 'flat'}")

     if not referenced_paths or not has_nested:
         # Legacy flat behavior (existing logic): treat each mapping key as independent string field
+        LOGGER.info("AOAI: Using flat schema generation (no nested structures detected).")
         data_source_config = {
             "type": "custom",
             "item_schema": {
@@ -617,6 +656,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
         for key in column_mapping.keys():
             props[key] = {"type": "string"}
             req.append(key)
+        LOGGER.info(f"AOAI: Flat schema generated with {len(props)} properties: {list(props.keys())}")
         return data_source_config

     # NEW: If all nested paths share the same first segment (e.g. 'item'),
@@ -625,12 +665,14 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
     first_segments = {p.split(".")[0] for p in referenced_paths}
     strip_wrapper = False
     wrapper_name = None
+    LOGGER.info(f"AOAI: First segments in referenced paths: {first_segments}")
     if len(first_segments) == 1:
         only_seg = next(iter(first_segments))
         # We only strip if that segment looks like the canonical wrapper.
         if only_seg == WRAPPER_KEY:
             strip_wrapper = True
             wrapper_name = only_seg
+            LOGGER.info(f"AOAI: All paths start with wrapper '{WRAPPER_KEY}', will strip from schema.")

     effective_paths = referenced_paths
     if strip_wrapper:
@@ -645,9 +687,12 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
         # If stripping produced at least one usable path, adopt; else fall back to original.
         if stripped:
             effective_paths = stripped
+            LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}")

+    LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...")
     nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")

+    LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'")
     return {
         "type": "custom",
         "item_schema": nested_schema,
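Note: to make the two branches concrete, a mapping whose `${data.<path>}` references contain no dots (for example `{"query": "${data.query}", "response": "${run.outputs.response}"}`, where the `${run.outputs.*}` entry is ignored) takes the legacy flat branch, while dotted references such as `${data.item.query}` trigger the nested schema built by `_build_schema_tree_from_paths`. A rough sketch of the flat construction; the `item_schema` keys beyond what this hunk shows are assumptions, not copied from the package:

    column_mapping = {"query": "${data.query}", "response": "${run.outputs.response}"}

    # Flat branch: every mapping key becomes an independent string property.
    props = {key: {"type": "string"} for key in column_mapping}
    data_source_config = {
        "type": "custom",
        "item_schema": {
            "type": "object",  # assumed boilerplate around the visible props/req fields
            "properties": props,
            "required": list(column_mapping),
        },
    }

The nested branch instead strips the canonical `item` wrapper (the `WRAPPER_KEY` referenced above) and returns the tree it builds under `item_schema`, as the last hunk shows.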
@@ -697,6 +742,23 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
     :return: A dictionary that can be used as the data source input for an OAI evaluation run.
     :rtype: Dict[str, Any]
     """
+
+    def _convert_value_to_string(val: Any) -> str:
+        """Convert a value to string representation for AOAI evaluation."""
+        if val is None:
+            return ""
+        elif isinstance(val, (str, int, float, bool)):
+            return str(val)
+        else:
+            try:  # Attempt to JSON serialize lists/dicts
+                return json.dumps(val, ensure_ascii=False)
+            except (TypeError, ValueError):
+                # Fallback for unserializable objects
+                return str(val)
+
+    LOGGER.info(
+        f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..."
+    )
     # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
     # relative_parts excludes the wrapper (so schema + content align).
     path_specs: List[Tuple[str, List[str], str]] = []
@@ -746,24 +808,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
             leaf_name = pieces[-1]
             path_specs.append((formatted_entry, [leaf_name], run_col))

+    LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.")
     content: List[Dict[str, Any]] = []

     for _, row in input_data_df.iterrows():
         item_root: Dict[str, Any] = {}

+        # Track which dataframe columns have been processed via column_mapping
+        processed_cols: Set[str] = set()
+
         for _, rel_parts, df_col in path_specs:
             # Safely fetch value
             val = row.get(df_col, None)

             # Convert value to string to match schema's "type": "string" leaves.
-
-            if val is None:
-                str_val = ""
-            elif isinstance(val, (str, int, float, bool)):
-                str_val = str(val)
-            else:
-                # Lists / dicts / other -> string for now
-                str_val = str(val)
+            str_val = _convert_value_to_string(val)

             # Insert into nested dict
             cursor = item_root
@@ -776,8 +835,19 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
             leaf_key = rel_parts[-1]
             cursor[leaf_key] = str_val

+            # Mark this dataframe column as processed
+            processed_cols.add(df_col)
+
+        # Add any unmapped dataframe columns directly to item_root
+        for col_name in input_data_df.columns:
+            if col_name not in processed_cols:
+                val = row.get(col_name, None)
+                str_val = _convert_value_to_string(val)
+                item_root[col_name] = str_val
+
         content.append({WRAPPER_KEY: item_root})

+    LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.")
     return {
         "type": "jsonl",
         "source": {
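Note: the new `_convert_value_to_string` helper changes how non-scalar cell values reach the data source. In 1.12.0 lists and dicts were passed through `str()`; 1.13.0 JSON-serializes them and only falls back to `str()` when serialization fails. A standalone rerun of the same conversion logic, for illustration:

    import json
    from typing import Any

    def convert_value_to_string(val: Any) -> str:
        # Mirrors the helper added in the hunk above.
        if val is None:
            return ""
        elif isinstance(val, (str, int, float, bool)):
            return str(val)
        else:
            try:  # Attempt to JSON serialize lists/dicts
                return json.dumps(val, ensure_ascii=False)
            except (TypeError, ValueError):
                return str(val)  # Fallback for unserializable objects

    print(convert_value_to_string(None))                        # ''
    print(convert_value_to_string(3.5))                         # '3.5'
    print(convert_value_to_string({"tool": "search", "k": 3}))  # '{"tool": "search", "k": 3}'
    print(convert_value_to_string({1, 2}))                      # not JSON-serializable -> '{1, 2}'

The other behavioral addition in the same hunks is that dataframe columns not covered by the column mapping are now appended to each item as well, converted with the same helper.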
@@ -812,6 +882,7 @@ def _begin_eval_run(
     :rtype: str
     """

+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...")
     data_source = _get_data_source(input_data_df, column_mapping)
     eval_run = client.evals.runs.create(
         eval_id=eval_group_id,
@@ -820,6 +891,7 @@ def _begin_eval_run(
         metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
         # TODO decide if we want to add our own timeout value?
     )
+    LOGGER.info(f"AOAI: Eval run created successfully with ID: {eval_run.id}")
     return eval_run.id


@@ -856,8 +928,11 @@ def _wait_for_run_conclusion(
         if total_wait > max_wait_seconds:
             wait_interval -= total_wait - max_wait_seconds
         sleep(wait_interval)
+        iters += 1
         response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
+        LOGGER.info(f"AOAI: Polling iteration {iters}, status: {response.status}, total wait: {total_wait:.1f}s")
         if response.status not in ["queued", "in_progress"]:
+            LOGGER.info(f"AOAI: Eval run {eval_run_id} reached terminal status: {response.status}")
             return response
         if total_wait > max_wait_seconds:
             raise EvaluationException(

azure/ai/evaluation/_evaluate/_utils.py

@@ -7,7 +7,8 @@ import os
 import re
 import tempfile
 from pathlib import Path
-
+import time
+from typing import Any, Dict, List, NamedTuple, Optional, Union, cast
 import uuid
 import base64
 import math
@@ -25,7 +26,7 @@ from azure.ai.evaluation._constants import (
     Prefixes,
 )
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 from azure.ai.evaluation._version import VERSION
 from azure.ai.evaluation._user_agent import UserAgentSingleton
 from azure.ai.evaluation._azure._clients import LiteMLClient
@@ -196,8 +197,14 @@ def _log_metrics_and_instance_results_onedp(
         )
     )

+    # TODO: type mis-match because Evaluation instance is assigned to EvaluationRun
+    evaluation_id = (
+        upload_run_response.name  # type: ignore[attr-defined]
+        if hasattr(upload_run_response, "name")
+        else upload_run_response.id
+    )
     update_run_response = client.update_evaluation_run(
-        name=
+        name=evaluation_id,
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             status="Completed",
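Note: the `evaluation_id` block above simply prefers a `name` attribute on the upload response and falls back to `id`. A minimal illustration of that preference order with stand-in objects (the real response types come from the SDK and are not shown here):

    from types import SimpleNamespace

    def resolve_evaluation_id(upload_run_response) -> str:
        # Same preference order as the hunk above: use .name when present, else .id.
        if hasattr(upload_run_response, "name"):
            return upload_run_response.name
        return upload_run_response.id

    print(resolve_evaluation_id(SimpleNamespace(name="eval-123", id="run-9")))  # eval-123
    print(resolve_evaluation_id(SimpleNamespace(id="run-9")))                   # run-9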

azure/ai/evaluation/_evaluator_definition.py (new file)

@@ -0,0 +1,76 @@
+from abc import ABC
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EvaluatorMetric:
+    type: str = "ordinal"
+    desirable_direction: Optional[str] = None
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {"type": self.type}
+        if self.desirable_direction is not None:
+            result["desirable_direction"] = self.desirable_direction
+        if self.min_value is not None:
+            result["min_value"] = self.min_value
+        if self.max_value is not None:
+            result["max_value"] = self.max_value
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorMetric":
+        return cls(
+            type=data.get("type", "ordinal"),
+            desirable_direction=data.get("desirable_direction"),
+            min_value=data.get("min_value"),
+            max_value=data.get("max_value"),
+        )
+
+
+@dataclass
+class ObjectParameterDescriptorWithRequired:
+    required: List[str] = field(default_factory=list)
+    type: str = "object"
+    properties: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {"required": self.required, "type": self.type, "properties": self.properties}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ObjectParameterDescriptorWithRequired":
+        return cls(
+            required=data.get("required", []), type=data.get("type", "object"), properties=data.get("properties", {})
+        )
+
+
+class EvaluatorDefinition(ABC):
+    """Base class for evaluator definitions"""
+
+    def __init__(self):
+        self.init_parameters: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.metrics: Dict[str, EvaluatorMetric] = {}
+        self.data_schema: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.type: str = "unknown"
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {
+            "type": self.type,
+            "init_parameters": self.init_parameters.to_dict(),
+            "metrics": {k: v.to_dict() for k, v in self.metrics.items()},
+            "data_schema": self.data_schema.to_dict(),
+        }
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorDefinition":
+        # Create a generic instance since specific subclasses are not defined
+        instance = cls.__new__(cls)
+        instance.__init__()
+
+        instance.init_parameters = ObjectParameterDescriptorWithRequired.from_dict(data.get("init_parameters", {}))
+        instance.metrics = {k: EvaluatorMetric.from_dict(v) for k, v in data.get("metrics", {}).items()}
+        instance.data_schema = ObjectParameterDescriptorWithRequired.from_dict(data.get("data_schema", {}))
+        return instance
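Note: the new module is plain dataclasses plus a dictionary round trip. A quick usage sketch; the field values are invented, and the private import path is taken from the file list above, so treat it as internal API:

    from azure.ai.evaluation._evaluator_definition import (
        EvaluatorMetric,
        ObjectParameterDescriptorWithRequired,
    )

    metric = EvaluatorMetric(type="continuous", desirable_direction="increase", min_value=0.0, max_value=1.0)
    payload = metric.to_dict()
    # {'type': 'continuous', 'desirable_direction': 'increase', 'min_value': 0.0, 'max_value': 1.0}
    assert EvaluatorMetric.from_dict(payload) == metric  # dataclass equality compares field values

    schema = ObjectParameterDescriptorWithRequired(
        required=["query"],
        properties={"query": {"type": "string"}, "response": {"type": "string"}},
    )
    assert ObjectParameterDescriptorWithRequired.from_dict(schema.to_dict()) == schema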

azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -46,6 +46,7 @@ class BleuScoreEvaluator(EvaluatorBase):
            https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    .. admonition:: Example with Threshold:
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_bleu_score_evaluator]
            :end-before: [END threshold_bleu_score_evaluator]

azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -56,23 +56,6 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
    :param kwargs: Additional arguments to pass to the evaluator.
    :type kwargs: Any

-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
-                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
    .. note::

        If this evaluator is supplied to the `evaluate` function, the metric

azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -5,6 +5,7 @@
 import inspect
 from abc import ABC, abstractmethod
 import json
+import copy
 from typing import (
     Any,
     Callable,
@@ -39,6 +40,8 @@ from azure.ai.evaluation._common._experimental import experimental

 from ._conversation_aggregators import GetAggregator, GetAggregatorType

+import copy
+
 P = ParamSpec("P")
 T = TypeVar("T")
 T_EvalValue = TypeVar("T_EvalValue")
@@ -111,6 +114,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     _NOT_APPLICABLE_RESULT = "not applicable"
     _PASS_RESULT = "pass"
     _FAIL_RESULT = "fail"
+    _type = "azure_ai_evaluator"

     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~

@@ -488,13 +492,17 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
        """
        tool_calls = []
        tool_results_map = {}
-
-
+
+        # Work on a deep copy to avoid modifying the original object
+        response_copy = copy.deepcopy(response)
+
+        if isinstance(response_copy, list):
+            for message in response_copy:
                # Extract tool calls from assistant messages
                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
                    for content_item in message.get("content"):
                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                            tool_calls.append(content_item)
+                            tool_calls.append(copy.deepcopy(content_item))

                # Extract tool results from tool messages
                elif message.get("role") == "tool" and message.get("tool_call_id"):
@@ -582,7 +590,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
        """
        # Convert inputs into list of evaluable inputs.
-
+        try:
+            eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        except Exception as e:
+            print(f"Error converting kwargs to eval_input_list: {e}")
+            raise e
        per_turn_results = []
        # Evaluate all inputs.
        for eval_input in eval_input_list: