azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
  3. azure/ai/evaluation/_aoai/label_grader.py +6 -10
  4. azure/ai/evaluation/_aoai/python_grader.py +7 -10
  5. azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
  6. azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +241 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -2
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
  33. azure/ai/evaluation/_evaluate/_utils.py +10 -3
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
  38. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  39. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
  40. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  41. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  42. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  43. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
  44. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  45. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  46. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  47. azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
  48. azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
  49. azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
  50. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
  52. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  53. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  55. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  56. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  57. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  58. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  59. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  60. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  61. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  62. azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
  63. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  64. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  65. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  66. azure/ai/evaluation/_exceptions.py +6 -1
  67. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  68. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  69. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  70. azure/ai/evaluation/_model_configurations.py +26 -0
  71. azure/ai/evaluation/_version.py +1 -1
  72. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  73. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  74. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  75. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  76. azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
  77. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  78. azure/ai/evaluation/red_team/_red_team.py +494 -37
  79. azure/ai/evaluation/red_team/_red_team_result.py +48 -28
  80. azure/ai/evaluation/red_team/_result_processor.py +558 -29
  81. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  82. azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
  83. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  84. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  85. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  86. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  87. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  88. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  90. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  91. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  92. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  94. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  95. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
  96. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
  97. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  98. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  99. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py

@@ -2,6 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

+import json
 import logging
 import re

@@ -60,6 +61,7 @@ def _split_evaluators_and_grader_configs(
     :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
     :rtype: Tuple[Dict[str, Callable], Dict[str, AoaiGrader]]
     """
+    LOGGER.info(f"AOAI: Splitting {len(evaluators)} evaluators into AOAI graders and standard evaluators...")
     true_evaluators = {}
     aoai_graders = {}
     for key, value in evaluators.items():
@@ -67,6 +69,7 @@ def _split_evaluators_and_grader_configs(
             aoai_graders[key] = value
         else:
             true_evaluators[key] = value
+    LOGGER.info(f"AOAI: Found {len(aoai_graders)} AOAI graders and {len(true_evaluators)} standard evaluators.")
     return true_evaluators, aoai_graders


@@ -103,11 +106,18 @@ def _begin_aoai_evaluation(
     LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
     all_eval_run_info: List[OAIEvalRunCreationInfo] = []

-    for selected_graders, selected_column_mapping in _get_graders_and_column_mappings(graders, column_mappings):
+    grader_mapping_list = list(_get_graders_and_column_mappings(graders, column_mappings))
+    LOGGER.info(f"AOAI: Will create {len(grader_mapping_list)} separate evaluation run(s) based on column mappings.")
+
+    for idx, (selected_graders, selected_column_mapping) in enumerate(grader_mapping_list):
+        LOGGER.info(
+            f"AOAI: Starting evaluation run {idx + 1}/{len(grader_mapping_list)} with {len(selected_graders)} grader(s)..."
+        )
         all_eval_run_info.append(
             _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name)
         )

+    LOGGER.info(f"AOAI: Successfully created {len(all_eval_run_info)} evaluation run(s).")
     return all_eval_run_info


@@ -133,6 +143,7 @@ def _begin_single_aoai_evaluation(
     """

     # Format data for eval group creation
+    LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}")
     grader_name_list = []
     grader_list = []
     # It's expected that all graders supplied for a single eval run use the same credentials
@@ -143,10 +154,12 @@ def _begin_single_aoai_evaluation(
         grader_name_list.append(name)
         grader_list.append(grader._grader_config)
     effective_column_mapping: Dict[str, str] = column_mapping or {}
+    LOGGER.info(f"AOAI: Generating data source config with {len(effective_column_mapping)} column mapping(s)...")
     data_source_config = _generate_data_source_config(data, effective_column_mapping)
+    LOGGER.info(f"AOAI: Data source config generated with schema type: {data_source_config.get('type')}")

     # Create eval group
-    # import pdb; pdb.set_trace()
+    LOGGER.info(f"AOAI: Creating eval group with {len(grader_list)} testing criteria...")
     eval_group_info = client.evals.create(
         data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
     )
@@ -167,6 +180,7 @@ def _begin_single_aoai_evaluation(
         grader_name_map[criteria.id] = name

     # Create eval run
+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' with {len(data)} data rows...")
     eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping)
     LOGGER.info(
         f"AOAI: Eval run created with id {eval_run_id}."
@@ -197,13 +211,16 @@ def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> T
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """

+    LOGGER.info(f"AOAI: Retrieving results from {len(all_run_info)} evaluation run(s)...")
     run_metrics = {}
     output_df = pd.DataFrame()
-    for run_info in all_run_info:
+    for idx, run_info in enumerate(all_run_info):
+        LOGGER.info(f"AOAI: Fetching results for run {idx + 1}/{len(all_run_info)} (ID: {run_info['eval_run_id']})...")
         cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
         output_df = pd.concat([output_df, cur_output_df], axis=1)
         run_metrics.update(cur_run_metrics)

+    LOGGER.info(f"AOAI: Successfully retrieved all results. Combined dataframe shape: {output_df.shape}")
     return output_df, run_metrics


@@ -223,8 +240,10 @@ def _get_single_run_results(
     :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
     """
     # Wait for evaluation run to complete
+    LOGGER.info(f"AOAI: Waiting for eval run {run_info['eval_run_id']} to complete...")
     run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])

+    LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -235,6 +254,7 @@ def _get_single_run_results(
         )

     # Convert run results into a dictionary of metrics
+    LOGGER.info(f"AOAI: Processing results and calculating metrics for run {run_info['eval_run_id']}...")
     run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
@@ -255,8 +275,10 @@ def _get_single_run_results(
         ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
         run_metrics[formatted_column_name] = ratio
+        LOGGER.info(f"AOAI: Grader '{grader_name}': {passed} passed, {failed} failed, pass_rate={ratio:.4f}")

     # Collect all results with pagination
+    LOGGER.info(f"AOAI: Collecting output items for run {run_info['eval_run_id']} with pagination...")
     all_results: List[Any] = []
     next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
@@ -280,6 +302,7 @@ def _get_single_run_results(
         else:
             break

+    LOGGER.info(f"AOAI: Collected {len(all_results)} total output items across all pages.")
     listed_results: Dict[str, List[Any]] = {"index": []}
     # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
@@ -329,6 +352,7 @@ def _get_single_run_results(

     # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
+    LOGGER.info(f"AOAI: Processing {num_rows} result rows into dataframe...")
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
@@ -356,6 +380,7 @@ def _get_single_run_results(
     expected = run_info.get("expected_rows", None)
     if expected is not None:
         pre_len = len(output_df)
+        LOGGER.info(f"AOAI: Validating result count: expected {expected} rows, received {pre_len} rows.")
         # Assumes original datasource_item_id space is 0..expected-1
         output_df = output_df.reindex(range(expected))
         if pre_len != expected:
@@ -388,6 +413,9 @@ def _get_single_run_results(

     # Reset to RangeIndex so downstream concatenation aligns on position
     output_df.reset_index(drop=True, inplace=True)
+    LOGGER.info(
+        f"AOAI: Successfully processed run {run_info['eval_run_id']} with final dataframe shape: {output_df.shape}"
+    )
     return output_df, run_metrics


@@ -481,11 +509,16 @@ def _get_graders_and_column_mappings(
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """

+    LOGGER.info(f"AOAI: Organizing {len(graders)} graders with column mappings...")
     if column_mappings is None:
+        LOGGER.info("AOAI: No column mappings provided, each grader will have its own eval run.")
         return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
     if default_mapping is None:
         default_mapping = {}
+    LOGGER.info(
+        f"AOAI: Using default mapping with {len(default_mapping)} entries for graders without specific mappings."
+    )
     return [
         ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
         for name, grader in graders.items()
@@ -593,17 +626,23 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
     helper function.
     """
     # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
+    LOGGER.info(
+        f"AOAI: Generating data source config for {len(input_data_df)} rows with {len(column_mapping)} column mapping(s)..."
+    )
     referenced_paths: List[str] = []
     for v in column_mapping.values():
         m = DATA_PATH_PATTERN.match(v)
         if m:
             referenced_paths.append(m.group(1))

+    LOGGER.info(f"AOAI: Found {len(referenced_paths)} referenced paths in column mappings: {referenced_paths}")
     # Decide if we have nested structures
     has_nested = any("." in p for p in referenced_paths)
+    LOGGER.info(f"AOAI: Schema generation mode: {'nested' if has_nested else 'flat'}")

     if not referenced_paths or not has_nested:
         # Legacy flat behavior (existing logic): treat each mapping key as independent string field
+        LOGGER.info("AOAI: Using flat schema generation (no nested structures detected).")
         data_source_config = {
             "type": "custom",
             "item_schema": {
@@ -617,6 +656,7 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
         for key in column_mapping.keys():
             props[key] = {"type": "string"}
             req.append(key)
+        LOGGER.info(f"AOAI: Flat schema generated with {len(props)} properties: {list(props.keys())}")
         return data_source_config

     # NEW: If all nested paths share the same first segment (e.g. 'item'),
@@ -625,12 +665,14 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
     first_segments = {p.split(".")[0] for p in referenced_paths}
     strip_wrapper = False
     wrapper_name = None
+    LOGGER.info(f"AOAI: First segments in referenced paths: {first_segments}")
     if len(first_segments) == 1:
         only_seg = next(iter(first_segments))
         # We only strip if that segment looks like the canonical wrapper.
         if only_seg == WRAPPER_KEY:
             strip_wrapper = True
             wrapper_name = only_seg
+            LOGGER.info(f"AOAI: All paths start with wrapper '{WRAPPER_KEY}', will strip from schema.")

     effective_paths = referenced_paths
     if strip_wrapper:
@@ -645,9 +687,12 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di
         # If stripping produced at least one usable path, adopt; else fall back to original.
         if stripped:
             effective_paths = stripped
+            LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}")

+    LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...")
     nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")

+    LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'")
     return {
         "type": "custom",
         "item_schema": nested_schema,
azure/ai/evaluation/_evaluate/_evaluate_aoai.py (continued)

@@ -697,6 +742,23 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
     :return: A dictionary that can be used as the data source input for an OAI evaluation run.
     :rtype: Dict[str, Any]
     """
+
+    def _convert_value_to_string(val: Any) -> str:
+        """Convert a value to string representation for AOAI evaluation."""
+        if val is None:
+            return ""
+        elif isinstance(val, (str, int, float, bool)):
+            return str(val)
+        else:
+            try:  # Attempt to JSON serialize lists/dicts
+                return json.dumps(val, ensure_ascii=False)
+            except (TypeError, ValueError):
+                # Fallback for unserializable objects
+                return str(val)
+
+    LOGGER.info(
+        f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..."
+    )
     # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
     # relative_parts excludes the wrapper (so schema + content align).
     path_specs: List[Tuple[str, List[str], str]] = []
@@ -746,24 +808,21 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
             leaf_name = pieces[-1]
             path_specs.append((formatted_entry, [leaf_name], run_col))

+    LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.")
     content: List[Dict[str, Any]] = []

     for _, row in input_data_df.iterrows():
         item_root: Dict[str, Any] = {}

+        # Track which dataframe columns have been processed via column_mapping
+        processed_cols: Set[str] = set()
+
         for _, rel_parts, df_col in path_specs:
             # Safely fetch value
             val = row.get(df_col, None)

             # Convert value to string to match schema's "type": "string" leaves.
-            # (If you later infer types, you can remove the stringify.)
-            if val is None:
-                str_val = ""
-            elif isinstance(val, (str, int, float, bool)):
-                str_val = str(val)
-            else:
-                # Lists / dicts / other -> string for now
-                str_val = str(val)
+            str_val = _convert_value_to_string(val)

             # Insert into nested dict
             cursor = item_root
@@ -776,8 +835,19 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
             leaf_key = rel_parts[-1]
             cursor[leaf_key] = str_val

+            # Mark this dataframe column as processed
+            processed_cols.add(df_col)
+
+        # Add any unmapped dataframe columns directly to item_root
+        for col_name in input_data_df.columns:
+            if col_name not in processed_cols:
+                val = row.get(col_name, None)
+                str_val = _convert_value_to_string(val)
+                item_root[col_name] = str_val
+
         content.append({WRAPPER_KEY: item_root})

+    LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.")
     return {
         "type": "jsonl",
         "source": {
@@ -812,6 +882,7 @@ def _begin_eval_run(
     :rtype: str
     """

+    LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...")
     data_source = _get_data_source(input_data_df, column_mapping)
     eval_run = client.evals.runs.create(
         eval_id=eval_group_id,
@@ -820,6 +891,7 @@
         metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
         # TODO decide if we want to add our own timeout value?
     )
+    LOGGER.info(f"AOAI: Eval run created successfully with ID: {eval_run.id}")
     return eval_run.id


@@ -856,8 +928,11 @@ def _wait_for_run_conclusion(
         if total_wait > max_wait_seconds:
             wait_interval -= total_wait - max_wait_seconds
         sleep(wait_interval)
+        iters += 1
         response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
+        LOGGER.info(f"AOAI: Polling iteration {iters}, status: {response.status}, total wait: {total_wait:.1f}s")
        if response.status not in ["queued", "in_progress"]:
+            LOGGER.info(f"AOAI: Eval run {eval_run_id} reached terminal status: {response.status}")
            return response
        if total_wait > max_wait_seconds:
            raise EvaluationException(
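Most of the additions in this module are `LOGGER.info` calls prefixed with "AOAI:". A hedged sketch for surfacing them during a local run, assuming the module logger follows the usual `logging.getLogger(__name__)` convention under the `azure.ai.evaluation` namespace:

    # Assumption: the AOAI eval module logs through logging.getLogger(__name__), so its
    # records roll up under the "azure.ai.evaluation" logger hierarchy.
    import logging

    logging.basicConfig(format="%(asctime)s %(name)s %(levelname)s %(message)s")
    logging.getLogger("azure.ai.evaluation").setLevel(logging.INFO)
    # Subsequent evaluate(...) calls that hit the AOAI grader path should now emit the
    # "AOAI: ..." progress messages added in this release.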
azure/ai/evaluation/_evaluate/_utils.py

@@ -7,7 +7,8 @@ import os
 import re
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, NamedTuple, Optional, Union, cast
+import time
+from typing import Any, Dict, List, NamedTuple, Optional, Union, cast
 import uuid
 import base64
 import math
@@ -25,7 +26,7 @@ from azure.ai.evaluation._constants import (
     Prefixes,
 )
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
 from azure.ai.evaluation._version import VERSION
 from azure.ai.evaluation._user_agent import UserAgentSingleton
 from azure.ai.evaluation._azure._clients import LiteMLClient
@@ -196,8 +197,14 @@ def _log_metrics_and_instance_results_onedp(
         )
     )

+    # TODO: type mis-match because Evaluation instance is assigned to EvaluationRun
+    evaluation_id = (
+        upload_run_response.name  # type: ignore[attr-defined]
+        if hasattr(upload_run_response, "name")
+        else upload_run_response.id
+    )
     update_run_response = client.update_evaluation_run(
-        name=upload_run_response.id,
+        name=evaluation_id,
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             status="Completed",
azure/ai/evaluation/_evaluator_definition.py

@@ -0,0 +1,76 @@
+from abc import ABC
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EvaluatorMetric:
+    type: str = "ordinal"
+    desirable_direction: Optional[str] = None
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {"type": self.type}
+        if self.desirable_direction is not None:
+            result["desirable_direction"] = self.desirable_direction
+        if self.min_value is not None:
+            result["min_value"] = self.min_value
+        if self.max_value is not None:
+            result["max_value"] = self.max_value
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorMetric":
+        return cls(
+            type=data.get("type", "ordinal"),
+            desirable_direction=data.get("desirable_direction"),
+            min_value=data.get("min_value"),
+            max_value=data.get("max_value"),
+        )
+
+
+@dataclass
+class ObjectParameterDescriptorWithRequired:
+    required: List[str] = field(default_factory=list)
+    type: str = "object"
+    properties: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {"required": self.required, "type": self.type, "properties": self.properties}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ObjectParameterDescriptorWithRequired":
+        return cls(
+            required=data.get("required", []), type=data.get("type", "object"), properties=data.get("properties", {})
+        )
+
+
+class EvaluatorDefinition(ABC):
+    """Base class for evaluator definitions"""
+
+    def __init__(self):
+        self.init_parameters: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.metrics: Dict[str, EvaluatorMetric] = {}
+        self.data_schema: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.type: str = "unknown"
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {
+            "type": self.type,
+            "init_parameters": self.init_parameters.to_dict(),
+            "metrics": {k: v.to_dict() for k, v in self.metrics.items()},
+            "data_schema": self.data_schema.to_dict(),
+        }
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorDefinition":
+        # Create a generic instance since specific subclasses are not defined
+        instance = cls.__new__(cls)
+        instance.__init__()
+
+        instance.init_parameters = ObjectParameterDescriptorWithRequired.from_dict(data.get("init_parameters", {}))
+        instance.metrics = {k: EvaluatorMetric.from_dict(v) for k, v in data.get("metrics", {}).items()}
+        instance.data_schema = ObjectParameterDescriptorWithRequired.from_dict(data.get("data_schema", {}))
+        return instance
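A hedged usage sketch for this new module; the subclass name, metric name, and field values below are illustrative rather than part of the package, and the module is private (underscore-prefixed), so the import path may change:

    # Hypothetical subclass for illustration; only the abstract base and dataclasses above
    # actually ship in azure/ai/evaluation/_evaluator_definition.py.
    from azure.ai.evaluation._evaluator_definition import EvaluatorDefinition, EvaluatorMetric

    class DemoEvaluatorDefinition(EvaluatorDefinition):
        def __init__(self):
            super().__init__()
            self.type = "demo"  # illustrative value
            self.metrics = {
                "score": EvaluatorMetric(type="ordinal", desirable_direction="increase", min_value=1.0, max_value=5.0)
            }

    definition = DemoEvaluatorDefinition()
    payload = definition.to_dict()                        # plain JSON-serializable dict
    restored = DemoEvaluatorDefinition.from_dict(payload)
    assert restored.metrics["score"].max_value == 5.0     # round-trips through to_dict/from_dict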
azure/ai/evaluation/_evaluators/_bleu/_bleu.py

@@ -46,6 +46,7 @@ class BleuScoreEvaluator(EvaluatorBase):
        https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    .. admonition:: Example with Threshold:
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_bleu_score_evaluator]
            :end-before: [END threshold_bleu_score_evaluator]
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py

@@ -56,23 +56,6 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
    :param kwargs: Additional arguments to pass to the evaluator.
    :type kwargs: Any

-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
-                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
    .. note::

        If this evaluator is supplied to the `evaluate` function, the metric
azure/ai/evaluation/_evaluators/_common/_base_eval.py

@@ -5,6 +5,7 @@
 import inspect
 from abc import ABC, abstractmethod
 import json
+import copy
 from typing import (
     Any,
     Callable,
@@ -39,6 +40,8 @@ from azure.ai.evaluation._common._experimental import experimental

 from ._conversation_aggregators import GetAggregator, GetAggregatorType

+import copy
+
 P = ParamSpec("P")
 T = TypeVar("T")
 T_EvalValue = TypeVar("T_EvalValue")
@@ -111,6 +114,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     _NOT_APPLICABLE_RESULT = "not applicable"
     _PASS_RESULT = "pass"
     _FAIL_RESULT = "fail"
+    _type = "azure_ai_evaluator"

     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~

@@ -488,13 +492,17 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         """
         tool_calls = []
         tool_results_map = {}
-        if isinstance(response, list):
-            for message in response:
+
+        # Work on a deep copy to avoid modifying the original object
+        response_copy = copy.deepcopy(response)
+
+        if isinstance(response_copy, list):
+            for message in response_copy:
                # Extract tool calls from assistant messages
                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
                    for content_item in message.get("content"):
                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                            tool_calls.append(content_item)
+                            tool_calls.append(copy.deepcopy(content_item))

                # Extract tool results from tool messages
                elif message.get("role") == "tool" and message.get("tool_call_id"):
@@ -582,7 +590,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
-        eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        try:
+            eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        except Exception as e:
+            print(f"Error converting kwargs to eval_input_list: {e}")
+            raise e
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
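The deep-copy changes above mean that extracting tool calls from an agent response no longer mutates the caller's message list. An illustrative-only check of that behavior (the message shapes here are examples, not a documented schema):

    # Mirrors the copy pattern from the hunk above: deep-copy the response before walking it,
    # and deep-copy each extracted tool_call item so later edits cannot leak back.
    import copy

    response = [
        {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search"}]},
        {"role": "tool", "tool_call_id": "call_1", "content": [{"type": "tool_result", "tool_result": "ok"}]},
    ]
    snapshot = copy.deepcopy(response)

    tool_calls = []
    for message in copy.deepcopy(response):
        if message.get("role") == "assistant" and isinstance(message.get("content"), list):
            for content_item in message["content"]:
                if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
                    tool_calls.append(copy.deepcopy(content_item))

    tool_calls[0]["name"] = "mutated"   # downstream edits only touch the copies
    assert response == snapshot         # the original messages are untouched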