azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -2,11 +2,13 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

+ import json
  import logging
+ import re

  from openai import AzureOpenAI, OpenAI
  import pandas as pd
- from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List
+ from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List, cast, Set
  from time import sleep

  from ._batch_run import CodeClient, ProxyClient
@@ -21,6 +23,15 @@ from azure.ai.evaluation._common._experimental import experimental
  TClient = TypeVar("TClient", ProxyClient, CodeClient)
  LOGGER = logging.getLogger(__name__)

+ # Precompiled regex for extracting data paths from mapping expressions of the form
+ # ${data.some.dotted.path}. Compiled once at import time to avoid repeated
+ # recompilation on each call to _generate_data_source_config.
+ DATA_PATH_PATTERN = re.compile(r"^\$\{data\.([a-zA-Z0-9_\.]+)\}$")
+
+ # Canonical top-level wrapper key expected in nested JSONL evaluation rows.
+ # Centralizing here avoids magic strings sprinkled through schema/content generation code.
+ WRAPPER_KEY = "item"
+

  class OAIEvalRunCreationInfo(TypedDict, total=True):
      """Configuration for an evaluator"""
@@ -50,6 +61,7 @@ def _split_evaluators_and_grader_configs(
      :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
      :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
      """
+     LOGGER.info(f"AOAI: Splitting {len(evaluators)} evaluators into AOAI graders and standard evaluators...")
      true_evaluators = {}
      aoai_graders = {}
      for key, value in evaluators.items():
@@ -57,6 +69,7 @@ def _split_evaluators_and_grader_configs(
              aoai_graders[key] = value
          else:
              true_evaluators[key] = value
+     LOGGER.info(f"AOAI: Found {len(aoai_graders)} AOAI graders and {len(true_evaluators)} standard evaluators.")
      return true_evaluators, aoai_graders


@@ -93,16 +106,23 @@ def _begin_aoai_evaluation(
      LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
      all_eval_run_info: List[OAIEvalRunCreationInfo] = []

-     for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
+     grader_mapping_list = list(_get_graders_and_column_mappings(graders, column_mappings))
+     LOGGER.info(f"AOAI: Will create {len(grader_mapping_list)} separate evaluation run(s) based on column mappings.")
+
+     for idx, (selected_graders, selected_column_mapping) in enumerate(grader_mapping_list):
+         LOGGER.info(
+             f"AOAI: Starting evaluation run {idx + 1}/{len(grader_mapping_list)} with {len(selected_graders)} grader(s)..."
+         )
          all_eval_run_info.append(
              _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
          )

+     LOGGER.info(f"AOAI: Successfully created {len(all_eval_run_info)} evaluation run(s).")
      return all_eval_run_info


  def _begin_single_aoai_evaluation(
-     graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Dict[str, str], run_name: str
+     graders: Dict[str, AzureOpenAIGrader], data: pd.DataFrame, column_mapping: Optional[Dict[str, str]], run_name: str
  ) -> OAIEvalRunCreationInfo:
      """
      Use the AOAI SDK to start an evaluation of the inputted dataset against the supplied graders.
@@ -111,8 +131,10 @@ def _begin_single_aoai_evaluation(

      :param graders: The graders to use for the evaluation. Should be a dictionary of string to AOAIGrader.
      :type graders: Dict[str, AoaiGrader]
-     :param data_source_config: The data source configuration to apply to the
-     :type data_source_config: pd.DataFrame
+     :param data: The input data to evaluate, as a pandas DataFrame.
+     :type data: pd.DataFrame
+     :param column_mapping: The column mapping to apply. If None, an empty mapping is used.
+     :type column_mapping: Optional[Dict[str, str]]
      :param run_name: The name of the evaluation run.
      :type run_name: str
      :return: A tuple containing the eval group ID and eval run ID of the resultant eval run, as well as a dictionary
@@ -121,6 +143,7 @@ def _begin_single_aoai_evaluation(
      """

      # Format data for eval group creation
+     LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}")
      grader_name_list = []
      grader_list = []
      # It's expected that all graders supplied for a single eval run use the same credentials
@@ -130,10 +153,13 @@ def _begin_single_aoai_evaluation(
      for name, grader in graders.items():
          grader_name_list.append(name)
          grader_list.append(grader._grader_config)
-     data_source_config = _generate_data_source_config(data, column_mapping)
+     effective_column_mapping: Dict[str, str] = column_mapping or {}
+     LOGGER.info(f"AOAI: Generating data source config with {len(effective_column_mapping)} column mapping(s)...")
+     data_source_config = _generate_data_source_config(data, effective_column_mapping)
+     LOGGER.info(f"AOAI: Data source config generated with schema type: {data_source_config.get('type')}")

      # Create eval group
-     # import pdb; pdb.set_trace()
+     LOGGER.info(f"AOAI: Creating eval group with {len(grader_list)} testing criteria...")
      eval_group_info = client.evals.create(
          data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
      )
@@ -154,7 +180,8 @@ def _begin_single_aoai_evaluation(
          grader_name_map[criteria.id] = name

      # Create eval run
-     eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, column_mapping)
+     LOGGER.info(f"AOAI: Creating eval run '{run_name}' with {len(data)} data rows...")
+     eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping)
      LOGGER.info(
          f"AOAI: Eval run created with id {eval_run_id}."
          + " Results will be retrieved after normal evaluation is complete..."
@@ -184,13 +211,16 @@ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> T
      :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
      """

+     LOGGER.info(f"AOAI: Retrieving results from {len(all_run_info)} evaluation run(s)...")
      run_metrics = {}
      output_df = pd.DataFrame()
-     for run_info in all_run_info:
+     for idx, run_info in enumerate(all_run_info):
+         LOGGER.info(f"AOAI: Fetching results for run {idx + 1}/{len(all_run_info)} (ID: {run_info['eval_run_id']})...")
          cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
          output_df = pd.concat([output_df, cur_output_df], axis=1)
          run_metrics.update(cur_run_metrics)

+     LOGGER.info(f"AOAI: Successfully retrieved all results. Combined dataframe shape: {output_df.shape}")
      return output_df, run_metrics


@@ -210,8 +240,10 @@ def _get_single_run_results(
      :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
      """
      # Wait for evaluation run to complete
+     LOGGER.info(f"AOAI: Waiting for eval run {run_info['eval_run_id']} to complete...")
      run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])

+     LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
      if run_results.status != "completed":
          raise EvaluationException(
              message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -222,6 +254,7 @@ def _get_single_run_results(
          )

      # Convert run results into a dictionary of metrics
+     LOGGER.info(f"AOAI: Processing results and calculating metrics for run {run_info['eval_run_id']}...")
      run_metrics: Dict[str, Any] = {}
      if run_results.per_testing_criteria_results is None:
          msg = (
@@ -242,8 +275,10 @@ def _get_single_run_results(
          ratio = passed / (passed + failed) if (passed + failed) else 0.0
          formatted_column_name = f"{grader_name}.pass_rate"
          run_metrics[formatted_column_name] = ratio
+         LOGGER.info(f"AOAI: Grader '{grader_name}': {passed} passed, {failed} failed, pass_rate={ratio:.4f}")

      # Collect all results with pagination
+     LOGGER.info(f"AOAI: Collecting output items for run {run_info['eval_run_id']} with pagination...")
      all_results: List[Any] = []
      next_cursor: Optional[str] = None
      limit = 100  # Max allowed by API
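The pass_rate computed a few lines above is a simple guarded ratio: it divides passed by the total only when at least one result was scored, and otherwise reports 0.0 rather than raising ZeroDivisionError. A tiny standalone sketch of the same arithmetic (the grader name and counts below are invented for illustration):

def pass_rate(passed: int, failed: int) -> float:
    # Same expression as in _get_single_run_results: 0.0 when nothing was scored.
    return passed / (passed + failed) if (passed + failed) else 0.0

assert pass_rate(7, 3) == 0.7
assert pass_rate(0, 0) == 0.0
# The value lands under "<grader_name>.pass_rate", e.g. {"my_grader.pass_rate": 0.7}.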
@@ -267,6 +302,7 @@ def _get_single_run_results(
          else:
              break

+     LOGGER.info(f"AOAI: Collected {len(all_results)} total output items across all pages.")
      listed_results: Dict[str, List[Any]] = {"index": []}
      # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
      for row_result in all_results:
@@ -316,6 +352,7 @@ def _get_single_run_results(

      # Ensure all columns are the same length as the 'index' list
      num_rows = len(listed_results["index"])
+     LOGGER.info(f"AOAI: Processing {num_rows} result rows into dataframe...")
      for col_name in list(listed_results.keys()):
          if col_name != "index":
              col_length = len(listed_results[col_name])
@@ -343,6 +380,7 @@ def _get_single_run_results(
      expected = run_info.get("expected_rows", None)
      if expected is not None:
          pre_len = len(output_df)
+         LOGGER.info(f"AOAI: Validating result count: expected {expected} rows, received {pre_len} rows.")
          # Assumes original datasource_item_id space is 0..expected-1
          output_df = output_df.reindex(range(expected))
          if pre_len != expected:
@@ -375,6 +413,9 @@ def _get_single_run_results(

      # Reset to RangeIndex so downstream concatenation aligns on position
      output_df.reset_index(drop=True, inplace=True)
+     LOGGER.info(
+         f"AOAI: Successfully processed run {run_info['eval_run_id']} with final dataframe shape: {output_df.shape}"
+     )
      return output_df, run_metrics


@@ -468,47 +509,194 @@ def _get_graders_and_column_mappings(
      :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
      """

+     LOGGER.info(f"AOAI: Organizing {len(graders)} graders with column mappings...")
      if column_mappings is None:
+         LOGGER.info("AOAI: No column mappings provided, each grader will have its own eval run.")
          return [({name: grader}, None) for name, grader in graders.items()]
      default_mapping = column_mappings.get("default", None)
      if default_mapping is None:
          default_mapping = {}
+     LOGGER.info(
+         f"AOAI: Using default mapping with {len(default_mapping)} entries for graders without specific mappings."
+     )
      return [
          ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
          for name, grader in graders.items()
      ]


+ def _build_schema_tree_from_paths(
+     paths: List[str],
+     force_leaf_type: str = "string",
+ ) -> Dict[str, Any]:
+     """
+     Build a nested JSON schema (object) from a list of dot-delimited paths.
+     Each path represents a leaf. Intermediate segments become nested object properties.
+
+     Example input paths:
+         ["item.query",
+          "item.context.company.policy.security.passwords.rotation_days",
+          "item.context.company.policy.security.network.vpn.required"]
+
+     Returns schema fragment:
+         {
+             "type": "object",
+             "properties": {
+                 "item": {
+                     "type": "object",
+                     "properties": {
+                         "query": {"type": "string"},
+                         "context": {
+                             "type": "object",
+                             "properties": {
+                                 "company": { ... }
+                             },
+                             "required": ["company"]
+                         }
+                     },
+                     "required": ["query", "context"]
+                 }
+             },
+             "required": ["item"]
+         }
+
+     :param paths: A list of dot-delimited strings, each representing a leaf path
+         in the logical object hierarchy (e.g. ``"item.context.company.policy.security.passwords.rotation_days"``).
+         Empty path segments are ignored.
+     :type paths: List[str]
+     :param force_leaf_type: The JSON Schema ``type`` value to assign to every leaf node
+         produced from the supplied paths. Defaults to ``"string"``.
+     :type force_leaf_type: str
+     :return: A JSON Schema fragment describing the hierarchical structure implied by
+         the input paths. The returned schema root always has ``type: object`` with
+         recursively nested ``properties`` / ``required`` keys.
+     :rtype: Dict[str, Any]
+     """
+     # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool }
+     root: Dict[str, Any] = {"__children__": {}, "__leaf__": False}
+
+     def insert(path: str):
+         parts = [p for p in path.split(".") if p]
+         node = root
+         for i, part in enumerate(parts):
+             children = node["__children__"]
+             if part not in children:
+                 children[part] = {"__children__": {}, "__leaf__": False}
+             node = children[part]
+             if i == len(parts) - 1:
+                 node["__leaf__"] = True
+
+     for p in paths:
+         insert(p)
+
+     def to_schema(node: Dict[str, Any]) -> Dict[str, Any]:
+         children = node["__children__"]
+         if not children:
+             # Leaf node
+             return {"type": force_leaf_type}
+         props = {}
+         required = []
+         for name, child in children.items():
+             props[name] = to_schema(child)
+             required.append(name)
+         return {
+             "type": "object",
+             "properties": props,
+             "required": required,
+         }
+
+     return to_schema(root)
+
+
  def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
-     """Produce a data source config that maps all columns from the supplied data source into
-     the OAI API. The mapping is naive unless a column mapping is provided, in which case
-     the column mapping's values overrule the relevant naive mappings
+     """
+     Produce a data source config (JSON schema) that reflects nested object structure
+     when column mappings reference dotted paths (e.g., item.context.company...).
+
+     Backward compatibility:
+     - If all referenced source paths are single tokens (flat), fall back to legacy flat schema.
+     - Otherwise build a nested object schema covering only referenced leaves.

-     :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
-         helper function.
      :type input_data_df: pd.DataFrame
-     :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
+     :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
      :type column_mapping: Optional[Dict[str, str]]
+     :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
      :return: A dictionary that can act as data source config for OAI evaluation group creation.
      :rtype: Dict[str, Any]
+         helper function.
      """
+     # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
+     LOGGER.info(
+         f"AOAI: Generating data source config for {len(input_data_df)} rows with {len(column_mapping)} column mapping(s)..."
+     )
+     referenced_paths: List[str] = []
+     for v in column_mapping.values():
+         m = DATA_PATH_PATTERN.match(v)
+         if m:
+             referenced_paths.append(m.group(1))
+
+     LOGGER.info(f"AOAI: Found {len(referenced_paths)} referenced paths in column mappings: {referenced_paths}")
+     # Decide if we have nested structures
+     has_nested = any("." in p for p in referenced_paths)
+     LOGGER.info(f"AOAI: Schema generation mode: {'nested' if has_nested else 'flat'}")
+
+     if not referenced_paths or not has_nested:
+         # Legacy flat behavior (existing logic): treat each mapping key as independent string field
+         LOGGER.info("AOAI: Using flat schema generation (no nested structures detected).")
+         data_source_config = {
+             "type": "custom",
+             "item_schema": {
+                 "type": "object",
+                 "properties": {},
+                 "required": [],
+             },
+         }
+         props = data_source_config["item_schema"]["properties"]
+         req = data_source_config["item_schema"]["required"]
+         for key in column_mapping.keys():
+             props[key] = {"type": "string"}
+             req.append(key)
+         LOGGER.info(f"AOAI: Flat schema generated with {len(props)} properties: {list(props.keys())}")
+         return data_source_config
+
+     # NEW: If all nested paths share the same first segment (e.g. 'item'),
+     # treat that segment as the wrapper already provided by the JSONL line ("item": {...})
+     # so we exclude it from the schema (schema describes the *inside* of "item").
+     first_segments = {p.split(".")[0] for p in referenced_paths}
+     strip_wrapper = False
+     wrapper_name = None
+     LOGGER.info(f"AOAI: First segments in referenced paths: {first_segments}")
+     if len(first_segments) == 1:
+         only_seg = next(iter(first_segments))
+         # We only strip if that segment looks like the canonical wrapper.
+         if only_seg == WRAPPER_KEY:
+             strip_wrapper = True
+             wrapper_name = only_seg
+             LOGGER.info(f"AOAI: All paths start with wrapper '{WRAPPER_KEY}', will strip from schema.")
+
+     effective_paths = referenced_paths
+     if strip_wrapper:
+         stripped = []
+         for p in referenced_paths:
+             parts = p.split(".", 1)
+             if len(parts) == 2:
+                 stripped.append(parts[1])  # drop leading 'item.'
+             else:
+                 # Path was just 'item' (no leaf) – ignore; it doesn't define a leaf value.
+                 continue
+         # If stripping produced at least one usable path, adopt; else fall back to original.
+         if stripped:
+             effective_paths = stripped
+             LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}")

-     data_source_config = {
+     LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...")
+     nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")
+
+     LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'")
+     return {
          "type": "custom",
-         "item_schema": {
-             "type": "object",
-             "properties": {},
-             "required": [],
-         },
+         "item_schema": nested_schema,
      }
-     properties = data_source_config["item_schema"]["properties"]
-     required = data_source_config["item_schema"]["required"]
-     for key in column_mapping.keys():
-         properties[key] = {
-             "type": "string",
-         }
-         required.append(key)
-     return data_source_config


  def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
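To make the nested-schema generation above concrete: mapping values of the form ${data.item.<dotted.path>} are matched by DATA_PATH_PATTERN, the shared "item" wrapper is stripped, and the remaining dot-delimited paths are folded by _build_schema_tree_from_paths into a nested item_schema whose leaves are all strings. The sketch below reproduces only the path-extraction step with an invented column mapping; the expected schema is shown in comments and assumes the private helpers behave exactly as defined in the hunk above, so treat it as illustrative rather than a supported API.

import re

DATA_PATH_PATTERN = re.compile(r"^\$\{data\.([a-zA-Z0-9_\.]+)\}$")  # same pattern as the module constant

column_mapping = {
    "query": "${data.item.query}",
    "rotation_days": "${data.item.context.company.policy.security.passwords.rotation_days}",
    "response": "${run.outputs.response}",  # run outputs are ignored by the schema step
}

referenced = [m.group(1) for m in map(DATA_PATH_PATTERN.match, column_mapping.values()) if m]
# referenced == ["item.query", "item.context.company.policy.security.passwords.rotation_days"]
stripped = [p.split(".", 1)[1] for p in referenced]  # drop the shared "item" wrapper

# _build_schema_tree_from_paths(stripped) would then return:
# {"type": "object",
#  "properties": {
#      "query": {"type": "string"},
#      "context": {"type": "object",
#                  "properties": {"company": { ...nested down to a "rotation_days" string leaf... }},
#                  "required": ["company"]}},
#  "required": ["query", "context"]}
# and _generate_data_source_config wraps that as {"type": "custom", "item_schema": <schema>}.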
@@ -543,9 +731,9 @@ def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[st

  def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
      """
-     Given a dataframe of data to be evaluated, and an optional column mapping,
-     produce a dictionary can be used as the data source input for an OAI evaluation run.
-
+     Given a dataframe of data to be evaluated, and a column mapping,
+     produce a dictionary that can be used as the data source input for an OAI evaluation run.
+     Builds a nested 'item' object mirroring the hierarchical paths in the mapping values.
      :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
          helper function.
      :type input_data_df: pd.DataFrame
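The body that implements this follows in the next hunk: each ${data.item.<path>} mapping value is reduced to a path relative to the "item" wrapper, and the flattened, dot-named dataframe columns are re-nested into one dictionary per row (values are stringified, and unmapped columns are appended at the top of the item). A rough standalone illustration of just the re-nesting step, with invented column names and values:

from typing import Any, Dict, List

def nest(flat_row: Dict[str, str], rel_paths: List[str]) -> Dict[str, Any]:
    # Re-create the nested structure implied by dot-delimited column names.
    item: Dict[str, Any] = {}
    for path in rel_paths:
        parts = path.split(".")
        cursor = item
        for seg in parts[:-1]:
            cursor = cursor.setdefault(seg, {})
        cursor[parts[-1]] = flat_row.get("item." + path, "")
    return item

row = {
    "item.query": "How often do passwords rotate?",
    "item.context.company.policy.security.passwords.rotation_days": "90",
}
print(nest(row, ["query", "context.company.policy.security.passwords.rotation_days"]))
# {'query': 'How often do passwords rotate?',
#  'context': {'company': {'policy': {'security': {'passwords': {'rotation_days': '90'}}}}}}
# _get_data_source wraps each such dict as {"item": {...}} before emitting the "jsonl" data source content.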
@@ -554,26 +742,112 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
      :return: A dictionary that can be used as the data source input for an OAI evaluation run.
      :rtype: Dict[str, Any]
      """
-     content = []
-     column_to_source_map = {}
-     # Convert from column mapping's format to figure out actual column names in
-     # input dataframe, and map those to the appropriate OAI input names.
+
+     def _convert_value_to_string(val: Any) -> str:
+         """Convert a value to string representation for AOAI evaluation."""
+         if val is None:
+             return ""
+         elif isinstance(val, (str, int, float, bool)):
+             return str(val)
+         else:
+             try:  # Attempt to JSON serialize lists/dicts
+                 return json.dumps(val, ensure_ascii=False)
+             except (TypeError, ValueError):
+                 # Fallback for unserializable objects
+                 return str(val)
+
+     LOGGER.info(
+         f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..."
+     )
+     # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
+     # relative_parts excludes the wrapper (so schema + content align).
+     path_specs: List[Tuple[str, List[str], str]] = []
+
      for name, formatted_entry in column_mapping.items():
-         # From "${" from start and "}" from end before splitting.
-         entry_pieces = formatted_entry[2:-1].split(".")
-         if len(entry_pieces) == 2 and entry_pieces[0] == "data":
-             column_to_source_map[name] = entry_pieces[1]
-         elif len(entry_pieces) == 3 and entry_pieces[0] == "run" and entry_pieces[1] == "outputs":
-             column_to_source_map[name] = f"__outputs.{entry_pieces[2]}"
-
-     # Using the above mapping, transform the input dataframe into a content
-     # dictionary that'll work in an OAI data source.
-     for row in input_data_df.iterrows():
-         row_dict = {}
-         for oai_key, dataframe_key in column_to_source_map.items():
-             row_dict[oai_key] = str(row[1][dataframe_key])
-         content.append({"item": row_dict})
+         if not (
+             isinstance(formatted_entry, str) and formatted_entry.startswith("${") and formatted_entry.endswith("}")
+         ):
+             continue
+         body = formatted_entry[2:-1]  # remove ${ }
+         pieces = body.split(".")
+
+         if not pieces:
+             continue
+
+         if pieces[0] == "data":
+             # Data path: data.<maybe wrapper>.<...>
+             if len(pieces) == 1:
+                 continue
+             source_path = ".".join(pieces[1:])  # e.g. item.context.company...
+             # Skip mapping of wrapper itself
+             if source_path == WRAPPER_KEY:
+                 continue
+
+             # Determine dataframe column name (it is the full dotted path as flattened earlier)
+             dataframe_col = source_path
+
+             # Relative parts for nested insertion (drop leading wrapper if present)
+             if source_path.startswith(WRAPPER_KEY + "."):
+                 relative_path = source_path[len(WRAPPER_KEY) + 1 :]
+             else:
+                 # Path not under wrapper; treat its segments as is (will live directly under wrapper)
+                 relative_path = source_path
+
+             relative_parts = [p for p in relative_path.split(".") if p]
+
+             # Defensive: if mapping alias differs from leaf, prefer actual path leaf to stay consistent.
+             # (If you want alias override, replace relative_parts[-1] with name when name != path_leaf.)
+             if not relative_parts:
+                 continue
+
+             path_specs.append((formatted_entry, relative_parts, dataframe_col))
+
+         elif pieces[0] == "run" and len(pieces) >= 3 and pieces[1] == "outputs":
+             # Target / run outputs become __outputs.<rest> columns
+             run_col = "__outputs." + ".".join(pieces[2:])
+             leaf_name = pieces[-1]
+             path_specs.append((formatted_entry, [leaf_name], run_col))
+
+     LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.")
+     content: List[Dict[str, Any]] = []
+
+     for _, row in input_data_df.iterrows():
+         item_root: Dict[str, Any] = {}
+
+         # Track which dataframe columns have been processed via column_mapping
+         processed_cols: Set[str] = set()
+
+         for _, rel_parts, df_col in path_specs:
+             # Safely fetch value
+             val = row.get(df_col, None)
+
+             # Convert value to string to match schema's "type": "string" leaves.
+             str_val = _convert_value_to_string(val)
+
+             # Insert into nested dict
+             cursor = item_root
+             for seg in rel_parts[:-1]:
+                 nxt = cursor.get(seg)
+                 if not isinstance(nxt, dict):
+                     nxt = {}
+                     cursor[seg] = nxt
+                 cursor = nxt
+             leaf_key = rel_parts[-1]
+             cursor[leaf_key] = str_val
+
+             # Mark this dataframe column as processed
+             processed_cols.add(df_col)
+
+         # Add any unmapped dataframe columns directly to item_root
+         for col_name in input_data_df.columns:
+             if col_name not in processed_cols:
+                 val = row.get(col_name, None)
+                 str_val = _convert_value_to_string(val)
+                 item_root[col_name] = str_val
+
+         content.append({WRAPPER_KEY: item_root})

+     LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.")
      return {
          "type": "jsonl",
          "source": {
@@ -608,14 +882,16 @@ def _begin_eval_run(
      :rtype: str
      """

+     LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...")
      data_source = _get_data_source(input_data_df, column_mapping)
      eval_run = client.evals.runs.create(
          eval_id=eval_group_id,
-         data_source=data_source,
+         data_source=cast(Any, data_source),  # Cast for type checker: dynamic schema dict accepted by SDK at runtime
          name=run_name,
          metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
          # TODO decide if we want to add our own timeout value?
      )
+     LOGGER.info(f"AOAI: Eval run created successfully with ID: {eval_run.id}")
      return eval_run.id


@@ -652,8 +928,11 @@ def _wait_for_run_conclusion(
          if total_wait > max_wait_seconds:
              wait_interval -= total_wait - max_wait_seconds
          sleep(wait_interval)
+         iters += 1
          response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
+         LOGGER.info(f"AOAI: Polling iteration {iters}, status: {response.status}, total wait: {total_wait:.1f}s")
          if response.status not in ["queued", "in_progress"]:
+             LOGGER.info(f"AOAI: Eval run {eval_run_id} reached terminal status: {response.status}")
              return response
          if total_wait > max_wait_seconds:
              raise EvaluationException(
azure/ai/evaluation/_evaluate/_utils.py
@@ -7,7 +7,8 @@ import os
  import re
  import tempfile
  from pathlib import Path
- from typing import Any, Dict, NamedTuple, Optional, Union, cast
+ import time
+ from typing import Any, Dict, List, NamedTuple, Optional, Union, cast
  import uuid
  import base64
  import math
@@ -25,7 +26,7 @@ from azure.ai.evaluation._constants import (
      Prefixes,
  )
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
- from azure.ai.evaluation._model_configurations import AzureAIProject
+ from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
  from azure.ai.evaluation._version import VERSION
  from azure.ai.evaluation._user_agent import UserAgentSingleton
  from azure.ai.evaluation._azure._clients import LiteMLClient
@@ -196,8 +197,14 @@ def _log_metrics_and_instance_results_onedp(
          )
      )

+     # TODO: type mis-match because Evaluation instance is assigned to EvaluationRun
+     evaluation_id = (
+         upload_run_response.name  # type: ignore[attr-defined]
+         if hasattr(upload_run_response, "name")
+         else upload_run_response.id
+     )
      update_run_response = client.update_evaluation_run(
-         name=upload_run_response.id,
+         name=evaluation_id,
          evaluation=EvaluationUpload(
              display_name=evaluation_name,
              status="Completed",
@@ -330,7 +337,11 @@ def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
          json.dump(data_dict, f, ensure_ascii=False)

      # Use tqdm.write to print message without interfering with any current progress bar
-     tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
+     # Fall back to regular print if tqdm.write fails (e.g., when progress bar is closed)
+     try:
+         tqdm.write(f'Evaluation results saved to "{p.resolve()}".\n')
+     except Exception:
+         print(f'Evaluation results saved to "{p.resolve()}".\n')


  def _apply_column_mapping(
@@ -460,7 +471,7 @@ class JSONLDataFileLoader:
          self.filename = filename

      def load(self) -> pd.DataFrame:
-         return pd.read_json(self.filename, lines=True)
+         return pd.read_json(self.filename, lines=True, dtype=object)


  class CSVDataFileLoader:
@@ -468,7 +479,7 @@ class CSVDataFileLoader:
          self.filename = filename

      def load(self) -> pd.DataFrame:
-         return pd.read_csv(self.filename)
+         return pd.read_csv(self.filename, dtype=str)


  class DataLoaderFactory:
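The two loader changes at the end both turn off pandas' automatic dtype inference so values reach the evaluation pipeline as written in the input file: pd.read_csv now loads every column as text, and pd.read_json is given dtype=object, presumably to avoid the analogous coercion for JSONL input. A small illustration of what the CSV change avoids, using an in-memory file and an invented column name:

import io
import pandas as pd

csv_data = io.StringIO("conversation_id\n00123\n")
print(pd.read_csv(csv_data)["conversation_id"][0])             # 123: inferred as int64, leading zeros lost

csv_data.seek(0)
print(pd.read_csv(csv_data, dtype=str)["conversation_id"][0])  # "00123": preserved exactly as written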