azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py:

```diff
@@ -2,11 +2,13 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import json
 import logging
+import re

 from openai import AzureOpenAI, OpenAI
 import pandas as pd
-from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List
+from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List, cast, Set
 from time import sleep

 from ._batch_run import CodeClient, ProxyClient
@@ -21,6 +23,15 @@ from azure.ai.evaluation._common._experimental import experimental
 TClient = TypeVar("TClient", ProxyClient, CodeClient)
 LOGGER = logging.getLogger(__name__)

+# Precompiled regex for extracting data paths from mapping expressions of the form
+# ${data.some.dotted.path}. Compiled once at import time to avoid repeated
+# recompilation on each call to _generate_data_source_config.
+DATA_PATH_PATTERN = re.compile(r"^\$\{data\.([a-zA-Z0-9_\.]+)\}$")
+
+# Canonical top-level wrapper key expected in nested JSONL evaluation rows.
+# Centralizing here avoids magic strings sprinkled through schema/content generation code.
+WRAPPER_KEY = "item"
+

 class OAIEvalRunCreationInfo(TypedDict, total=True):
     """Configuration for an evaluator"""
```
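The two module-level constants added above drive the nested-schema handling introduced further down in this file. As a minimal sketch of how a pattern of this shape behaves on typical mapping expressions (the sample expressions below are illustrative, not taken from the package):

```python
import re

# Same shape as DATA_PATH_PATTERN above: extracts the dotted path from "${data.<path>}".
pattern = re.compile(r"^\$\{data\.([a-zA-Z0-9_\.]+)\}$")

for expr in ("${data.item.query}", "${data.item.context.company.policy}", "${run.outputs.response}"):
    match = pattern.match(expr)
    print(expr, "->", match.group(1) if match else None)
# ${data.item.query} -> item.query
# ${data.item.context.company.policy} -> item.context.company.policy
# ${run.outputs.response} -> None (run outputs are deliberately not treated as data paths)
```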
```diff
@@ -50,6 +61,7 @@ def _split_evaluators_and_grader_configs(
     :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
     :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
     """
+    LOGGER.info(f"AOAI: Splitting {len(evaluators)} evaluators into AOAI graders and standard evaluators...")
     true_evaluators = {}
     aoai_graders = {}
     for key, value in evaluators.items():
@@ -57,6 +69,7 @@
             aoai_graders[key] = value
         else:
             true_evaluators[key] = value
+    LOGGER.info(f"AOAI: Found {len(aoai_graders)} AOAI graders and {len(true_evaluators)} standard evaluators.")
     return true_evaluators, aoai_graders


```
```diff
@@ -93,16 +106,23 @@ def _begin_aoai_evaluation(
     LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
     all_eval_run_info: List[OAIEvalRunCreationInfo] = []

-
+    grader_mapping_list = list(_get_graders_and_column_mappings(graders, column_mappings))
+    LOGGER.info(f"AOAI: Will create {len(grader_mapping_list)} separate evaluation run(s) based on column mappings.")
+
+    for idx, (selected_graders, selected_column_mapping) in enumerate(grader_mapping_list):
+        LOGGER.info(
+            f"AOAI: Starting evaluation run {idx + 1}/{len(grader_mapping_list)} with {len(selected_graders)} grader(s)..."
+        )
         all_eval_run_info.append(
             _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
         )

+    LOGGER.info(f"AOAI: Successfully created {len(all_eval_run_info)} evaluation run(s).")
     return all_eval_run_info


 def _begin_single_aoai_evaluation(
-    graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+    graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Optional[Dict[str, str]], run_name: str
 ) -> OAIEvalRunCreationInfo:
     """
     Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
@@ -111,8 +131,10 @@ def _begin_single_aoai_evaluation(

     :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
     :type graders: Dict[str, AoaiGrader]
-    :param
-    :type
+    :param data: The input data to evaluate, as a pandas DataFrame.
+    :type data: pd.DataFrame
+    :param column_mapping: The column mapping to apply. If None, an empty mapping is used.
+    :type column_mapping: Optional[Dict[str, str]]
     :param run_name: The name of the evaluation run.
     :type run_name: str
     :return: A tuple containing the eval group ID and eval run ID of the resultant eval run, as well as a dictionary
@@ -121,6 +143,7 @@
     """

     # Format data for eval group creation
+    LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}")
     grader_name_list = []
     grader_list = []
     # It's expected that all graders supplied for a single eval run use the same credentials
@@ -130,10 +153,13 @@
     for name, grader in graders.items():
         grader_name_list.append(name)
         grader_list.append(grader._grader_config)
-
+    effective_column_mapping: Dict[str, str] = column_mapping or {}
+    LOGGER.info(f"AOAI: Generating data source config with {len(effective_column_mapping)} column mapping(s)...")
+    data_source_config = _generate_data_source_config(data, effective_column_mapping)
+    LOGGER.info(f"AOAI: Data source config generated with schema type: {data_source_config.get('type')}")

     # Create eval group
-
+    LOGGER.info(f"AOAI: Creating eval group with {len(grader_list)} testing criteria...")
     eval_group_info = client.evals.create(
         data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
     )
```
```diff
@@ -154,7 +180,8 @@
         grader_name_map[criteria.id] = name

     # Create eval run
-
+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' with {len(data)} data rows...")
+    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping)
     LOGGER.info(
         f"AOAI: Eval run created with id {eval_run_id}."
         + " Results will be retrieved after normal evaluation is complete..."
@@ -184,13 +211,16 @@ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> T
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """

+    LOGGER.info(f"AOAI: Retrieving results from {len(all_run_info)} evaluation run(s)...")
     run_metrics = {}
     output_df = pd.DataFrame()
-    for run_info in all_run_info:
+    for idx, run_info in enumerate(all_run_info):
+        LOGGER.info(f"AOAI: Fetching results for run {idx + 1}/{len(all_run_info)} (ID: {run_info['eval_run_id']})...")
         cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
         output_df = pd.concat([output_df, cur_output_df], axis=1)
         run_metrics.update(cur_run_metrics)

+    LOGGER.info(f"AOAI: Successfully retrieved all results. Combined dataframe shape: {output_df.shape}")
     return output_df, run_metrics


@@ -210,8 +240,10 @@ def _get_single_run_results(
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """
     # Wait for evaluation run to complete
+    LOGGER.info(f"AOAI: Waiting for eval run {run_info['eval_run_id']} to complete...")
     run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])

+    LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -222,6 +254,7 @@
         )

     # Convert run results into a dictionary of metrics
+    LOGGER.info(f"AOAI: Processing results and calculating metrics for run {run_info['eval_run_id']}...")
     run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
@@ -242,8 +275,10 @@
         ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
+        LOGGER.info(f"AOAI: Grader '{grader_name}': {passed} passed, {failed} failed, pass_rate={ratio:.4f}")

     # Collect all results with pagination
+    LOGGER.info(f"AOAI: Collecting output items for run {run_info['eval_run_id']} with pagination...")
     all_results: List[Any] = []
     next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
@@ -267,6 +302,7 @@
         else:
             break

+    LOGGER.info(f"AOAI: Collected {len(all_results)} total output items across all pages.")
     listed_results: Dict[str, List[Any]] = {"index": []}
     # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
@@ -316,6 +352,7 @@

     # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
+    LOGGER.info(f"AOAI: Processing {num_rows} result rows into dataframe...")
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
@@ -343,6 +380,7 @@
     expected = run_info.get("expected_rows", None)
     if expected is not None:
         pre_len = len(output_df)
+        LOGGER.info(f"AOAI: Validating result count: expected {expected} rows, received {pre_len} rows.")
         # Assumes original datasource_item_id space is 0..expected-1
         output_df = output_df.reindex(range(expected))
         if pre_len != expected:
@@ -375,6 +413,9 @@

     # Reset to RangeIndex so downstream concatenation aligns on position
     output_df.reset_index(drop=True, inplace=True)
+    LOGGER.info(
+        f"AOAI: Successfully processed run {run_info['eval_run_id']} with final dataframe shape: {output_df.shape}"
+    )
     return output_df, run_metrics


```
```diff
@@ -468,47 +509,194 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """

+    LOGGER.info(f"AOAI: Organizing {len(graders)} graders with column mappings...")
     if column_mappings is None:
+        LOGGER.info("AOAI: No column mappings provided, each grader will have its own eval run.")
         return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
     if default_mapping is None:
         default_mapping = {}
+    LOGGER.info(
+        f"AOAI: Using default mapping with {len(default_mapping)} entries for graders without specific mappings."
+    )
     return [
         ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
         for name, grader in graders.items()
     ]


+def _build_schema_tree_from_paths(
+    paths: List[str],
+    force_leaf_type: str = "string",
+) -> Dict[str, Any]:
+    """
+    Build a nested JSON schema (object) from a list of dot-delimited paths.
+    Each path represents a leaf. Intermediate segments become nested object properties.
+
+    Example input paths:
+        ["item.query",
+         "item.context.company.policy.security.passwords.rotation_days",
+         "item.context.company.policy.security.network.vpn.required"]
+
+    Returns schema fragment:
+        {
+            "type": "object",
+            "properties": {
+                "item": {
+                    "type": "object",
+                    "properties": {
+                        "query": {"type": "string"},
+                        "context": {
+                            "type": "object",
+                            "properties": {
+                                "company": { ... }
+                            },
+                            "required": ["company"]
+                        }
+                    },
+                    "required": ["query", "context"]
+                }
+            },
+            "required": ["item"]
+        }
+
+    :param paths: A list of dot-delimited strings, each representing a leaf path
+        in the logical object hierarchy (e.g. ``"item.context.company.policy.security.passwords.rotation_days"``).
+        Empty path segments are ignored.
+    :type paths: List[str]
+    :param force_leaf_type: The JSON Schema ``type`` value to assign to every leaf node
+        produced from the supplied paths. Defaults to ``"string"``.
+    :type force_leaf_type: str
+    :return: A JSON Schema fragment describing the hierarchical structure implied by
+        the input paths. The returned schema root always has ``type: object`` with
+        recursively nested ``properties`` / ``required`` keys.
+    :rtype: Dict[str, Any]
+    """
+    # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool }
+    root: Dict[str, Any] = {"__children__": {}, "__leaf__": False}
+
+    def insert(path: str):
+        parts = [p for p in path.split(".") if p]
+        node = root
+        for i, part in enumerate(parts):
+            children = node["__children__"]
+            if part not in children:
+                children[part] = {"__children__": {}, "__leaf__": False}
+            node = children[part]
+            if i == len(parts) - 1:
+                node["__leaf__"] = True
+
+    for p in paths:
+        insert(p)
+
+    def to_schema(node: Dict[str, Any]) -> Dict[str, Any]:
+        children = node["__children__"]
+        if not children:
+            # Leaf node
+            return {"type": force_leaf_type}
+        props = {}
+        required = []
+        for name, child in children.items():
+            props[name] = to_schema(child)
+            required.append(name)
+        return {
+            "type": "object",
+            "properties": props,
+            "required": required,
+        }
+
+    return to_schema(root)
+
+
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
-    """
-
-
+    """
+    Produce a data source config (JSON schema) that reflects nested object structure
+    when column mappings reference dotted paths (e.g., item.context.company...).
+
+    Backward compatibility:
+    - If all referenced source paths are single tokens (flat), fall back to legacy flat schema.
+    - Otherwise build a nested object schema covering only referenced leaves.

-    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
-        helper function.
     :type input_data_df: pd.DataFrame
-    :param
+    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
     :type column_mapping: Optional[Dict[str, str]]
+    :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
     :return: A dictionary that can act as data source config for OAI evaluation group creation.
     :rtype: Dict[str, Any]
+        helper function.
     """
+    # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
+    LOGGER.info(
+        f"AOAI: Generating data source config for {len(input_data_df)} rows with {len(column_mapping)} column mapping(s)..."
+    )
+    referenced_paths: List[str] = []
+    for v in column_mapping.values():
+        m = DATA_PATH_PATTERN.match(v)
+        if m:
+            referenced_paths.append(m.group(1))
+
+    LOGGER.info(f"AOAI: Found {len(referenced_paths)} referenced paths in column mappings: {referenced_paths}")
+    # Decide if we have nested structures
+    has_nested = any("." in p for p in referenced_paths)
+    LOGGER.info(f"AOAI: Schema generation mode: {'nested' if has_nested else 'flat'}")
+
+    if not referenced_paths or not has_nested:
+        # Legacy flat behavior (existing logic): treat each mapping key as independent string field
+        LOGGER.info("AOAI: Using flat schema generation (no nested structures detected).")
+        data_source_config = {
+            "type": "custom",
+            "item_schema": {
+                "type": "object",
+                "properties": {},
+                "required": [],
+            },
+        }
+        props = data_source_config["item_schema"]["properties"]
+        req = data_source_config["item_schema"]["required"]
+        for key in column_mapping.keys():
+            props[key] = {"type": "string"}
+            req.append(key)
+        LOGGER.info(f"AOAI: Flat schema generated with {len(props)} properties: {list(props.keys())}")
+        return data_source_config
+
+    # NEW: If all nested paths share the same first segment (e.g. 'item'),
+    # treat that segment as the wrapper already provided by the JSONL line ("item": {...})
+    # so we exclude it from the schema (schema describes the *inside* of "item").
+    first_segments = {p.split(".")[0] for p in referenced_paths}
+    strip_wrapper = False
+    wrapper_name = None
+    LOGGER.info(f"AOAI: First segments in referenced paths: {first_segments}")
+    if len(first_segments) == 1:
+        only_seg = next(iter(first_segments))
+        # We only strip if that segment looks like the canonical wrapper.
+        if only_seg == WRAPPER_KEY:
+            strip_wrapper = True
+            wrapper_name = only_seg
+            LOGGER.info(f"AOAI: All paths start with wrapper '{WRAPPER_KEY}', will strip from schema.")
+
+    effective_paths = referenced_paths
+    if strip_wrapper:
+        stripped = []
+        for p in referenced_paths:
+            parts = p.split(".", 1)
+            if len(parts) == 2:
+                stripped.append(parts[1])  # drop leading 'item.'
+            else:
+                # Path was just 'item' (no leaf) – ignore; it doesn't define a leaf value.
+                continue
+        # If stripping produced at least one usable path, adopt; else fall back to original.
+        if stripped:
+            effective_paths = stripped
+            LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}")

-
+    LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...")
+    nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")
+
+    LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'")
+    return {
         "type": "custom",
-        "item_schema":
-            "type": "object",
-            "properties": {},
-            "required": [],
-        },
+        "item_schema": nested_schema,
     }
-    properties = data_source_config["item_schema"]["properties"]
-    required = data_source_config["item_schema"]["required"]
-    for key in column_mapping.keys():
-        properties[key] = {
-            "type": "string",
-        }
-        required.append(key)
-    return data_source_config


 def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
```
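For orientation, the new `_build_schema_tree_from_paths` helper shown above turns dot-delimited mapping paths into a nested JSON schema. The snippet below is an illustrative re-implementation of that idea under the same assumptions (leaves forced to `"string"`, empty segments ignored); the sample paths are made up and written as they would look after the `item` wrapper has been stripped:

```python
import json
from typing import Any, Dict, List


def build_schema_tree(paths: List[str], leaf_type: str = "string") -> Dict[str, Any]:
    """Illustrative re-implementation: fold dotted paths into a nested object schema."""
    root: Dict[str, Any] = {}
    for path in paths:
        node = root
        for part in [p for p in path.split(".") if p]:
            node = node.setdefault(part, {})

    def to_schema(node: Dict[str, Any]) -> Dict[str, Any]:
        if not node:  # no children -> leaf
            return {"type": leaf_type}
        return {
            "type": "object",
            "properties": {name: to_schema(child) for name, child in node.items()},
            "required": list(node.keys()),
        }

    return to_schema(root)


print(json.dumps(build_schema_tree(["query", "context.company.policy"]), indent=2))
# Leaves become {"type": "string"}; every intermediate segment becomes a nested
# object whose "required" list names all of its children.
```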
```diff
@@ -543,9 +731,9 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[st

 def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
     """
-    Given a dataframe of data to be evaluated, and
-    produce a dictionary can be used as the data source input for an OAI evaluation run.
-
+    Given a dataframe of data to be evaluated, and a column mapping,
+    produce a dictionary that can be used as the data source input for an OAI evaluation run.
+    Builds a nested 'item' object mirroring the hierarchical paths in the mapping values.
     :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
         helper function.
     :type input_data_df: pd.DataFrame
@@ -554,26 +742,112 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
     :return: A dictionary that can be used as the data source input for an OAI evaluation run.
     :rtype: Dict[str, Any]
     """
-
-
-
-
+
+    def _convert_value_to_string(val: Any) -> str:
+        """Convert a value to string representation for AOAI evaluation."""
+        if val is None:
+            return ""
+        elif isinstance(val, (str, int, float, bool)):
+            return str(val)
+        else:
+            try:  # Attempt to JSON serialize lists/dicts
+                return json.dumps(val, ensure_ascii=False)
+            except (TypeError, ValueError):
+                # Fallback for unserializable objects
+                return str(val)
+
+    LOGGER.info(
+        f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..."
+    )
+    # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
+    # relative_parts excludes the wrapper (so schema + content align).
+    path_specs: List[Tuple[str, List[str], str]] = []
+
     for name, formatted_entry in column_mapping.items():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not (
+            isinstance(formatted_entry, str) and formatted_entry.startswith("${") and formatted_entry.endswith("}")
+        ):
+            continue
+        body = formatted_entry[2:-1]  # remove ${ }
+        pieces = body.split(".")
+
+        if not pieces:
+            continue
+
+        if pieces[0] == "data":
+            # Data path: data.<maybe wrapper>.<...>
+            if len(pieces) == 1:
+                continue
+            source_path = ".".join(pieces[1:])  # e.g. item.context.company...
+            # Skip mapping of wrapper itself
+            if source_path == WRAPPER_KEY:
+                continue
+
+            # Determine dataframe column name (it is the full dotted path as flattened earlier)
+            dataframe_col = source_path
+
+            # Relative parts for nested insertion (drop leading wrapper if present)
+            if source_path.startswith(WRAPPER_KEY + "."):
+                relative_path = source_path[len(WRAPPER_KEY) + 1 :]
+            else:
+                # Path not under wrapper; treat its segments as is (will live directly under wrapper)
+                relative_path = source_path
+
+            relative_parts = [p for p in relative_path.split(".") if p]
+
+            # Defensive: if mapping alias differs from leaf, prefer actual path leaf to stay consistent.
+            # (If you want alias override, replace relative_parts[-1] with name when name != path_leaf.)
+            if not relative_parts:
+                continue
+
+            path_specs.append((formatted_entry, relative_parts, dataframe_col))
+
+        elif pieces[0] == "run" and len(pieces) >= 3 and pieces[1] == "outputs":
+            # Target / run outputs become __outputs.<rest> columns
+            run_col = "__outputs." + ".".join(pieces[2:])
+            leaf_name = pieces[-1]
+            path_specs.append((formatted_entry, [leaf_name], run_col))
+
+    LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.")
+    content: List[Dict[str, Any]] = []
+
+    for _, row in input_data_df.iterrows():
+        item_root: Dict[str, Any] = {}
+
+        # Track which dataframe columns have been processed via column_mapping
+        processed_cols: Set[str] = set()
+
+        for _, rel_parts, df_col in path_specs:
+            # Safely fetch value
+            val = row.get(df_col, None)
+
+            # Convert value to string to match schema's "type": "string" leaves.
+            str_val = _convert_value_to_string(val)
+
+            # Insert into nested dict
+            cursor = item_root
+            for seg in rel_parts[:-1]:
+                nxt = cursor.get(seg)
+                if not isinstance(nxt, dict):
+                    nxt = {}
+                    cursor[seg] = nxt
+                cursor = nxt
+            leaf_key = rel_parts[-1]
+            cursor[leaf_key] = str_val
+
+            # Mark this dataframe column as processed
+            processed_cols.add(df_col)
+
+        # Add any unmapped dataframe columns directly to item_root
+        for col_name in input_data_df.columns:
+            if col_name not in processed_cols:
+                val = row.get(col_name, None)
+                str_val = _convert_value_to_string(val)
+                item_root[col_name] = str_val
+
+        content.append({WRAPPER_KEY: item_root})

+    LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.")
     return {
         "type": "jsonl",
         "source": {
```
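The per-row loop added to `_get_data_source` is the content-side counterpart of the schema change: flattened dataframe columns such as `item.context.company.policy` are folded back into a nested `item` object, with every leaf stringified. A rough standalone sketch of that fold (the column names and values are illustrative):

```python
from typing import Any, Dict


def nest_row(flat_row: Dict[str, Any], wrapper: str = "item") -> Dict[str, Any]:
    """Fold dotted column names back into a nested dict under the wrapper key."""
    item_root: Dict[str, Any] = {}
    for col, val in flat_row.items():
        parts = [p for p in col.split(".") if p]
        if parts and parts[0] == wrapper:  # drop a leading "item." segment if present
            parts = parts[1:]
        if not parts:
            continue
        cursor = item_root
        for seg in parts[:-1]:
            cursor = cursor.setdefault(seg, {})
        cursor[parts[-1]] = str(val)  # leaves are stringified to match the schema
    return {wrapper: item_root}


print(nest_row({"item.query": "What is the VPN policy?", "item.context.company.policy.vpn": True}))
# {'item': {'query': 'What is the VPN policy?', 'context': {'company': {'policy': {'vpn': 'True'}}}}}
```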
```diff
@@ -608,14 +882,16 @@ def _begin_eval_run(
     :rtype: str
     """

+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...")
     data_source = _get_data_source(input_data_df, column_mapping)
     eval_run = client.evals.runs.create(
         eval_id=eval_group_id,
-        data_source=data_source,
+        data_source=cast(Any, data_source),  # Cast for type checker: dynamic schema dict accepted by SDK at runtime
         name=run_name,
         metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
         # TODO decide if we want to add our own timeout value?
     )
+    LOGGER.info(f"AOAI: Eval run created successfully with ID: {eval_run.id}")
     return eval_run.id


@@ -652,8 +928,11 @@ def _wait_for_run_conclusion(
         if total_wait > max_wait_seconds:
             wait_interval -= total_wait - max_wait_seconds
         sleep(wait_interval)
+        iters += 1
         response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
+        LOGGER.info(f"AOAI: Polling iteration {iters}, status: {response.status}, total wait: {total_wait:.1f}s")
         if response.status not in ["queued", "in_progress"]:
+            LOGGER.info(f"AOAI: Eval run {eval_run_id} reached terminal status: {response.status}")
             return response
         if total_wait > max_wait_seconds:
             raise EvaluationException(
```
azure/ai/evaluation/_evaluate/_utils.py:

```diff
@@ -7,7 +7,8 @@ import os
 import re
 import tempfile
 from pathlib import Path
-
+import time
+from typing import Any, Dict, List, NamedTuple, Optional, Union, cast
 import uuid
 import base64
 import math
@@ -25,7 +26,7 @@ from azure.ai.evaluation._constants import (
     Prefixes,
 )
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 from azure.ai.evaluation._version import VERSION
 from azure.ai.evaluation._user_agent import UserAgentSingleton
 from azure.ai.evaluation._azure._clients import LiteMLClient
@@ -196,8 +197,14 @@ def _log_metrics_and_instance_results_onedp(
         )
     )

+    # TODO: type mis-match because Evaluation instance is assigned to EvaluationRun
+    evaluation_id = (
+        upload_run_response.name  # type: ignore[attr-defined]
+        if hasattr(upload_run_response, "name")
+        else upload_run_response.id
+    )
     update_run_response = client.update_evaluation_run(
-        name=
+        name=evaluation_id,
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             status="Completed",
@@ -330,7 +337,11 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
         json.dump(data_dict, f, ensure_ascii=False)

     # Use tqdm.write to print message without interfering with any current progress bar
-    tqdm.write(
+    # Fall back to regular print if tqdm.write fails (e.g., when progress bar is closed)
+    try:
+        tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
+    except Exception:
+        print(f'Evaluation results saved to "{p.resolve()}".\n')


 def _apply_column_mapping(
@@ -460,7 +471,7 @@ class JSONLDataFileLoader:
         self.filename = filename

     def load(self) -> pd.DataFrame:
-        return pd.read_json(self.filename, lines=True)
+        return pd.read_json(self.filename, lines=True, dtype=object)


 class CSVDataFileLoader:
@@ -468,7 +479,7 @@ class CSVDataFileLoader:
         self.filename = filename

     def load(self) -> pd.DataFrame:
-        return pd.read_csv(self.filename)
+        return pd.read_csv(self.filename, dtype=str)


 class DataLoaderFactory:
```
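The `dtype` arguments added to both loaders keep pandas from silently coercing values while the evaluation data is read (for example, numeric-looking string IDs turning into integers). A small sketch of the difference, with made-up inline JSONL content:

```python
import io

import pandas as pd

jsonl = '{"id": "00123", "score": 5}\n{"id": "00456", "score": 7}\n'

coerced = pd.read_json(io.StringIO(jsonl), lines=True)
preserved = pd.read_json(io.StringIO(jsonl), lines=True, dtype=object)

print(coerced["id"].tolist())    # ids inferred as integers, leading zeros lost
print(preserved["id"].tolist())  # ['00123', '00456'] - original strings preserved
```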