azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of azure-ai-evaluation might be problematic.

Files changed (85)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  5. azure/ai/evaluation/_common/rai_service.py +3 -3
  6. azure/ai/evaluation/_common/utils.py +74 -17
  7. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  8. azure/ai/evaluation/_converters/_models.py +75 -26
  9. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  10. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  11. azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
  12. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
  13. azure/ai/evaluation/_evaluate/_utils.py +5 -2
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  15. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  21. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  23. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  24. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  25. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  29. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  30. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
  31. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  37. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
  38. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
  39. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  40. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
  42. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
  43. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  44. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
  45. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
  46. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  48. azure/ai/evaluation/_exceptions.py +1 -0
  49. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  50. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  51. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  52. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  53. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  54. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  55. azure/ai/evaluation/_version.py +1 -1
  56. azure/ai/evaluation/red_team/__init__.py +4 -3
  57. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  58. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  59. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  60. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  61. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  62. azure/ai/evaluation/red_team/_red_team.py +655 -2665
  63. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  64. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  65. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  66. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  67. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  68. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  69. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  70. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  71. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  72. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  73. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  74. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  75. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  76. azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  78. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
  80. azure/ai/evaluation/simulator/_simulator.py +12 -0
  81. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
  82. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
  83. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  84. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  85. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluate/_evaluate.py
@@ -7,7 +7,9 @@ import json
 import logging
 import os
 import re
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
+import tempfile
+import json
+from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
 
 from openai import OpenAI, AzureOpenAI
 from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -462,7 +464,7 @@ def _validate_columns_for_evaluators(
         )
 
 
-def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name, tags):
     if data is None:
         msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
@@ -611,6 +613,18 @@ def _apply_target_to_data(
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.USER_ERROR,
         )
+
+    # Log a warning if some rows failed
+    failed_lines = run_summary.get("failed_lines", 0)
+    completed_lines = run_summary["completed_lines"]
+    total_lines = failed_lines + completed_lines
+
+    if failed_lines > 0:
+        LOGGER.warning(
+            f"Target function completed {completed_lines} out of {total_lines} rows. "
+            f"{failed_lines} rows failed and will be filled with NaN values."
+        )
+
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -618,6 +632,13 @@ def _apply_target_to_data(
     # Sort output by line numbers
     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
     target_output.sort_index(inplace=True)
+
+    initial_data_with_line_numbers = initial_data.copy()
+    initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+    complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+    target_output = target_output.reindex(complete_index)
+
     target_output.reset_index(inplace=True, drop=False)
     # target_output contains only input columns, taken by function,
     # so we need to concatenate it to the input data frame.
@@ -626,8 +647,8 @@
     # Rename outputs columns to __outputs
     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
     target_output.rename(columns=rename_dict, inplace=True)
-    # Concatenate output to input
-    target_output = pd.concat([target_output, initial_data], axis=1)
+    # Concatenate output to input - now both dataframes have the same number of rows
+    target_output = pd.concat([initial_data, target_output], axis=1)
 
     return target_output, generated_columns, run
 
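
Reviewer note: the reindex against the full line-number range above is what keeps a partially failed target run aligned with the input rows. A minimal standalone pandas sketch of the same idea (column names are illustrative, not the SDK's):

    import pandas as pd

    # Four input rows; the target only produced output for rows 0, 1 and 3.
    initial_data = pd.DataFrame({"query": ["a", "b", "c", "d"]})
    target_output = pd.DataFrame({"response": ["ra", "rb", "rd"]}, index=[0, 1, 3])

    # Reindex on the complete 0..n-1 range: the missing row 2 becomes NaN.
    target_output = target_output.reindex(range(len(initial_data)))

    # Both frames now share the same 0..3 index, so the concat lines up row for row.
    combined = pd.concat([initial_data, target_output], axis=1)
    print(combined)  # row 2 has NaN in the 'response' column
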
@@ -645,7 +666,7 @@ def _process_column_mappings(
 
     processed_config: Dict[str, Dict[str, str]] = {}
 
-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")
 
     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
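
The widened pattern is the only change in the hunk above; a quick illustration of what it now accepts (values are made up):

    import re

    old = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")
    new = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

    nested = "${data.item.context}"
    print(bool(old.match(nested)))            # False: only a single segment was allowed
    print(bool(new.match(nested)))            # True: dotted sub-fields are now accepted
    print(bool(new.match("${data.query}")))   # True: flat references still work
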
@@ -704,6 +725,7 @@ def evaluate(
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> EvaluationResult:
     """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
@@ -736,6 +758,10 @@
         Defaults to false, which means that evaluations will continue regardless of failures.
         If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
     :paramtype fail_on_evaluator_errors: bool
+    :keyword tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+        Keys and values must be strings. For more information about tag limits, see:
+        https://learn.microsoft.com/en-us/azure/machine-learning/resource-limits-capacity?view=azureml-api-2#runs
+    :paramtype tags: Optional[Dict[str, str]]
     :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
     :paramtype user_agent: Optional[str]
     :return: Evaluation results.
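
For context, the new keyword rides along like any other evaluate() argument. A hedged usage sketch; the data path, evaluator choice and tag values are placeholders, and model_config / azure_ai_project are assumed to be defined elsewhere:

    from azure.ai.evaluation import evaluate, RelevanceEvaluator

    result = evaluate(
        data="eval_data.jsonl",                               # placeholder path
        evaluators={"relevance": RelevanceEvaluator(model_config)},
        azure_ai_project=azure_ai_project,                    # tags are recorded on the cloud evaluation run
        tags={"team": "search", "experiment": "prompt-v2"},   # str -> str only
    )
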
@@ -772,6 +798,7 @@
             azure_ai_project=azure_ai_project,
             output_path=output_path,
             fail_on_evaluator_errors=fail_on_evaluator_errors,
+            tags=tags,
             **kwargs,
         )
     except Exception as e:
@@ -840,6 +867,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     output_path: Optional[Union[str, os.PathLike]] = None,
     fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> EvaluationResult:
     if fail_on_evaluator_errors:
@@ -855,6 +883,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         output_path=output_path,
         azure_ai_project=azure_ai_project,
         evaluation_name=evaluation_name,
+        fail_on_evaluator_errors=fail_on_evaluator_errors,
+        tags=tags,
         **kwargs,
     )
 
@@ -934,7 +964,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     name_map = _map_names_to_builtins(evaluators, graders)
     if is_onedp_project(azure_ai_project):
         studio_url = _log_metrics_and_instance_results_onedp(
-            metrics, results_df, azure_ai_project, evaluation_name, name_map, **kwargs
+            metrics, results_df, azure_ai_project, evaluation_name, name_map, tags=tags, **kwargs
         )
     else:
         # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
@@ -942,7 +972,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
         studio_url = None
         if trace_destination:
            studio_url = _log_metrics_and_instance_results(
-                metrics, results_df, trace_destination, None, evaluation_name, name_map, **kwargs
+                metrics, results_df, trace_destination, None, evaluation_name, name_map, tags=tags, **kwargs
            )
 
     result_df_dict = results_df.to_dict("records")
@@ -962,6 +992,8 @@ def _preprocess_data(
     output_path: Optional[Union[str, os.PathLike]] = None,
     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
     evaluation_name: Optional[str] = None,
+    fail_on_evaluator_errors: bool = False,
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> __ValidatedData:
     # Process evaluator config to replace ${target.} with ${data.}
@@ -969,7 +1001,7 @@
         evaluator_config = {}
 
     input_data_df = _validate_and_load_data(
-        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name
+        target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name, tags
     )
     if target is not None:
         _validate_columns_for_target(input_data_df, target)
@@ -995,15 +1027,49 @@
     batch_run_client: BatchClient
     batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
 
-    if kwargs.pop("_use_run_submitter_client", False):
-        batch_run_client = RunSubmitterClient()
+    def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
+        """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
+        _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
+        _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
+
+        if _use_run_submitter_client is None and _use_pf_client is None:
+            # If both are unset, return default
+            return "run_submitter"
+
+        if _use_run_submitter_client and _use_pf_client:
+            raise EvaluationException(
+                message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
+                target=ErrorTarget.EVALUATE,
+                category=ErrorCategory.INVALID_VALUE,
+                blame=ErrorBlame.USER_ERROR,
+            )
+
+        if _use_run_submitter_client == False and _use_pf_client == False:
+            return "code_client"
+
+        if _use_run_submitter_client:
+            return "run_submitter"
+        if _use_pf_client:
+            return "pf_client"
+
+        if _use_run_submitter_client is None and _use_pf_client == False:
+            return "run_submitter"
+        if _use_run_submitter_client == False and _use_pf_client is None:
+            return "pf_client"
+
+        assert False, "This should be impossible"
+
+    client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
+
+    if client_type == "run_submitter":
+        batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
         batch_run_data = input_data_df
-    elif kwargs.pop("_use_pf_client", True):
+    elif client_type == "pf_client":
         batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
         # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
         # multiple evaluators. If the path is already absolute, abspath will return the original path.
         batch_run_data = os.path.abspath(data)
-    else:
+    elif client_type == "code_client":
         batch_run_client = CodeClient()
         batch_run_data = input_data_df
 
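
The helper above folds the two private flags into one of three batch clients. The truth table it implements, summarized (both kwargs are private and experimental; values in the call are placeholders):

    # _use_run_submitter_client   _use_pf_client    resulting client
    # None                        None              RunSubmitterClient (new default)
    # True                        None or False     RunSubmitterClient
    # None or False               True              ProxyClient (promptflow)
    # False                       None              ProxyClient (promptflow)
    # None                        False             RunSubmitterClient
    # False                       False             CodeClient
    # True                        True              EvaluationException

    # e.g. forcing the promptflow-based client from evaluate():
    evaluate(data="eval_data.jsonl", evaluators=evaluators, _use_pf_client=True)
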
@@ -1013,17 +1079,50 @@
             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )
 
-        for evaluator_name, mapping in column_mapping.items():
-            mapped_to_values = set(mapping.values())
-            for col in target_generated_columns:
-                # If user defined mapping differently, do not change it.
-                # If it was mapped to target, we have already changed it
-                # in _process_column_mappings
-                run_output = f"${{run.outputs.{col}}}"
-                # We will add our mapping only if
-                # customer did not mapped target output.
-                if col not in mapping and run_output not in mapped_to_values:
-                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+        # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
+        # This ensures that evaluators get all rows (including failed ones with NaN values)
+        if isinstance(batch_run_client, ProxyClient):
+            # Create a temporary JSONL file with the complete dataframe
+            temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
+            try:
+                for _, row in input_data_df.iterrows():
+                    row_dict = row.to_dict()
+                    temp_file.write(json.dumps(row_dict) + "\n")
+                temp_file.close()
+                batch_run_data = temp_file.name
+
+                # Update column mappings to use data references instead of run outputs
+                for evaluator_name, mapping in column_mapping.items():
+                    mapped_to_values = set(mapping.values())
+                    for col in target_generated_columns:
+                        # Use data reference instead of run output to ensure we get all rows
+                        target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                        # We will add our mapping only if customer did not map target output.
+                        if col not in mapping and target_reference not in mapped_to_values:
+                            column_mapping[evaluator_name][col] = target_reference
+
+                # Don't pass the target_run since we're now using the complete dataframe
+                target_run = None
+
+            except Exception as e:
+                # Clean up the temp file if something goes wrong
+                if os.path.exists(temp_file.name):
+                    os.unlink(temp_file.name)
+                raise e
+        else:
+            # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
+            batch_run_data = input_data_df
+
+            # Update column mappings for DataFrame clients
+            for evaluator_name, mapping in column_mapping.items():
+                mapped_to_values = set(mapping.values())
+                for col in target_generated_columns:
+                    target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                    # We will add our mapping only if customer did not map target output.
+                    if col not in mapping and target_reference not in mapped_to_values:
+                        column_mapping[evaluator_name][col] = target_reference
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
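
A standalone sketch of the temp-file handoff introduced above: the complete dataframe is dumped to JSONL so the promptflow-based client reads every row, padded or not (column names are illustrative):

    import json
    import tempfile

    import pandas as pd

    df = pd.DataFrame({"query": ["a", "b"], "__outputs.response": ["ra", None]})

    # Row-by-row JSONL dump, mirroring the loop in the hunk above.
    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
    for _, row in df.iterrows():
        tmp.write(json.dumps(row.to_dict()) + "\n")
    tmp.close()

    print(tmp.name)  # handed to the batch client; unlinked after the run (see the cleanup hunk below)

For flat frames, pandas' DataFrame.to_json(orient="records", lines=True) produces the same shape in one call.
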
@@ -1062,30 +1161,50 @@ def _run_callable_evaluators(
     batch_run_data = validated_data["batch_run_data"]
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
-    with EvalRunContext(batch_run_client):
-        runs = {
-            evaluator_name: batch_run_client.run(
-                flow=evaluator,
-                data=batch_run_data,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
 
-        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
-            evaluator_name: {
-                "result": batch_run_client.get_details(run, all_results=True),
-                "metrics": batch_run_client.get_metrics(run),
-                "run_summary": batch_run_client.get_run_summary(run),
+    # Clean up temporary file after evaluation if it was created
+    temp_file_to_cleanup = None
+    if (
+        isinstance(batch_run_client, ProxyClient)
+        and isinstance(batch_run_data, str)
+        and batch_run_data.endswith(".jsonl")
+    ):
+        # Check if it's a temporary file (contains temp directory path)
+        if tempfile.gettempdir() in batch_run_data:
+            temp_file_to_cleanup = batch_run_data
+
+    try:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    data=batch_run_data,
+                    # Don't pass target_run when using complete dataframe
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
             }
-            for evaluator_name, run in runs.items()
-        }
 
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+    finally:
+        # Clean up temporary file if it was created
+        if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
+            try:
+                os.unlink(temp_file_to_cleanup)
+            except Exception as e:
+                LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
     # Concatenate all results
     evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}

azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -29,6 +29,10 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
     eval_group_id: str
     eval_run_id: str
     grader_name_map: Dict[str, str]
+    # Total number of expected rows in the original dataset. Used to
+    # re-align AOAI grader results to guard against silent row drops
+    # causing horizontal concatenation misalignment.
+    expected_rows: int
 
 
 def _split_evaluators_and_grader_configs(
@@ -157,7 +161,11 @@ def _begin_single_aoai_evaluation(
     )
 
     return OAIEvalRunCreationInfo(
-        client=client, eval_group_id=eval_group_info.id, eval_run_id=eval_run_id, grader_name_map=grader_name_map
+        client=client,
+        eval_group_id=eval_group_info.id,
+        eval_run_id=eval_run_id,
+        grader_name_map=grader_name_map,
+        expected_rows=len(data),
     )
 
 
@@ -214,7 +222,7 @@
         )
 
     # Convert run results into a dictionary of metrics
-    run_metrics = {}
+    run_metrics: Dict[str, Any] = {}
     if run_results.per_testing_criteria_results is None:
         msg = (
             "AOAI evaluation run returned no results, despite 'completed' status. This might"
@@ -231,28 +239,16 @@
         grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
         passed = criteria_result.passed
         failed = criteria_result.failed
-        ratio = passed / (passed + failed)
+        ratio = passed / (passed + failed) if (passed + failed) else 0.0
         formatted_column_name = f"{grader_name}.pass_rate"
        run_metrics[formatted_column_name] = ratio
 
-    # Get full results and convert them into a dataframe.
-    # Notes on raw full data output from OAI eval runs:
-    # Each row in the full results list in itself a list.
-    # Each entry corresponds to one grader's results from the criteria list
-    # that was inputted to the eval group.
-    # Each entry is a dictionary, with a name, sample, passed boolean, and score number.
-    # The name is used to figure out which grader the entry refers to, the sample is ignored.
-    # The passed and score values are then added to the results dictionary, prepended with the grader's name
-    # as entered by the user in the inputted dictionary.
-    # Other values, if they exist, are also added to the results dictionary.
-
     # Collect all results with pagination
-    all_results = []
-    next_cursor = None
+    all_results: List[Any] = []
+    next_cursor: Optional[str] = None
     limit = 100  # Max allowed by API
 
     while True:
-        # Build kwargs for the API call
         list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
         if next_cursor is not None:
             list_kwargs["after"] = next_cursor
@@ -265,28 +261,25 @@
         # Check for more pages
         if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
             if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
-                # Get the last item's ID for cursor-based pagination
                 next_cursor = raw_list_results.data[-1].id
             else:
                 break
         else:
             break
 
-    listed_results = {"index": []}
-    # raw data has no order guarantees, we need to sort them by their
-    # datasource_item_id
+    listed_results: Dict[str, List[Any]] = {"index": []}
+    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
     for row_result in all_results:
-        # Add the datasource_item_id for later sorting
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
             grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
             for name, value in single_grader_row_result.items():
-                if name in ["name"]:  # Todo decide if we also want to exclude "sample"
+                if name in ["name"]:
                     continue
                 if name.lower() == "passed":
-                    # create a `_result` column for each grader
+                    # Create a `_result` column for each grader
                     result_column_name = f"outputs.{grader_name}.{grader_name}_result"
-                    if len(result_column_name) < 50:  # TODO: is this the limit? Should we keep "passed"?
+                    if len(result_column_name) < 50:
                         if result_column_name not in listed_results:
                             listed_results[result_column_name] = []
                         listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])
@@ -296,23 +289,67 @@
                         listed_results[formatted_column_name] = []
                     listed_results[formatted_column_name].append(value)
 
-    # Ensure all columns have the same length as the index
+    # Ensure all columns are the same length as the 'index' list
     num_rows = len(listed_results["index"])
     for col_name in list(listed_results.keys()):
         if col_name != "index":
             col_length = len(listed_results[col_name])
             if col_length < num_rows:
-                # Pad with None values
                 listed_results[col_name].extend([None] * (num_rows - col_length))
             elif col_length > num_rows:
-                # This shouldn't happen, but truncate if it does
                 listed_results[col_name] = listed_results[col_name][:num_rows]
 
     output_df = pd.DataFrame(listed_results)
-    # sort by index
-    output_df = output_df.sort_values("index", ascending=[True])
-    # remove index column
-    output_df.drop(columns=["index"], inplace=True)
+
+    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
+    if "index" not in output_df.columns:
+        output_df["index"] = list(range(len(output_df)))
+
+    # Deterministic ordering by original datasource_item_id
+    output_df = output_df.sort_values("index", ascending=True)
+
+    # Keep a temporary row-id copy for debugging/inspection.
+    # Use underscores (not hyphens) to avoid pandas column handling quirks.
+    output_df["__azure_ai_evaluation_index"] = output_df["index"]
+
+    # Preserve original ids as index, then pad to expected length
+    output_df.set_index("index", inplace=True)
+
+    expected = run_info.get("expected_rows", None)
+    if expected is not None:
+        pre_len = len(output_df)
+        # Assumes original datasource_item_id space is 0..expected-1
+        output_df = output_df.reindex(range(expected))
+        if pre_len != expected:
+            missing_rows = expected - pre_len
+            LOGGER.warning(
+                "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
+                run_info["eval_run_id"],
+                pre_len,
+                expected,
+                missing_rows,
+            )
+            # Add a per-grader 'row_missing' boolean for padded rows
+            grader_user_names: Set[str] = set()
+            for col in output_df.columns:
+                if col.startswith("outputs."):
+                    parts = col.split(".")
+                    if len(parts) > 2:
+                        grader_user_names.add(parts[1])
+            if grader_user_names:
+                missing_index_mask = output_df.isna().all(axis=1)
+                for g in grader_user_names:
+                    col_name = f"outputs.{g}.row_missing"
+                    if col_name not in output_df:
+                        output_df[col_name] = False
+                    output_df.loc[missing_index_mask, col_name] = True
+
+    # Drop the temporary helper column before returning (no public surface change)
+    if "__azure_ai_evaluation_index" in output_df.columns:
+        output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")
+
+    # Reset to RangeIndex so downstream concatenation aligns on position
+    output_df.reset_index(drop=True, inplace=True)
     return output_df, run_metrics
 
 
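
Reviewer note: stripped of the logging and row_missing bookkeeping, the alignment fix above is plain reindexing against the expected row count. A sketch with made-up grader output, not the SDK code:

    import pandas as pd

    expected = 5
    # The grader returned rows 0, 1 and 4 only (datasource_item_id as the index).
    results = pd.DataFrame({"outputs.g.score": [1.0, 0.5, 0.0]}, index=[0, 1, 4])

    padded = results.reindex(range(expected))             # rows 2 and 3 become NaN
    padded["outputs.g.row_missing"] = padded.isna().all(axis=1)
    padded.reset_index(drop=True, inplace=True)           # positional alignment for the later concat
    print(padded)
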
@@ -353,6 +390,7 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader,
     )
 
     id_map = {
@@ -361,6 +399,7 @@
         AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
         AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
+        AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
     }
 
     for key in id_map.keys():
@@ -404,8 +443,15 @@
     :rtype: List[Tuple[Dict[str, AoaiGrader], Optional[Dict[str, str]]]]
     """
 
+    if column_mappings is None:
+        return [({name: grader}, None) for name, grader in graders.items()]
     default_mapping = column_mappings.get("default", None)
-    return [({name: grader}, column_mappings.get(name, default_mapping)) for name, grader in graders.items()]
+    if default_mapping is None:
+        default_mapping = {}
+    return [
+        ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
+        for name, grader in graders.items()
+    ]
 
 
 def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
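
The early return above covers graders supplied without any column_mappings; schematically, the pairings now come out as follows (my_grader is a hypothetical grader instance):

    graders = {"qa": my_grader}

    # column_mappings is None
    #   -> [({"qa": my_grader}, None)]

    # column_mappings = {"default": {"response": "${data.response}"}}
    #   -> [({"qa": my_grader}, {"response": "${data.response}"})]

    # column_mappings = {"qa": {"response": "${data.answer}"}}
    #   -> [({"qa": my_grader}, {"response": "${data.answer}"})]
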

azure/ai/evaluation/_evaluate/_utils.py
@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
     project_url: str,
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
 
@@ -178,7 +179,6 @@
 
     properties = {
         EvaluationRunProperties.RUN_TYPE: "eval_run",
-        EvaluationRunProperties.EVALUATION_RUN: "promptflow.BatchRun",
         EvaluationRunProperties.EVALUATION_SDK: f"azure-ai-evaluation:{VERSION}",
         "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
     }
@@ -191,6 +191,8 @@
     upload_run_response = client.start_evaluation_run(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
+            properties=properties,
+            tags=tags,
         )
     )
 
@@ -202,7 +204,6 @@
             outputs={
                 "evaluationResultId": create_evaluation_result_response.id,
             },
-            properties=properties,
         ),
     )
 
@@ -216,6 +217,7 @@ def _log_metrics_and_instance_results(
     run: Optional[Run],
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -245,6 +247,7 @@
         workspace_name=ws_triad.workspace_name,
         management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
 

azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -54,7 +54,7 @@ class BleuScoreEvaluator(EvaluatorBase):
             :caption: Initialize with threshold and call an BleuScoreEvaluator.
     """
 
-    id = "azureml://registries/azureml/models/Bleu-Score-Evaluator/versions/3"
+    id = "azureai://built-in/evaluators/bleu_score"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     def __init__(self, *, threshold=0.5):

azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py
@@ -79,19 +79,26 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
     for the code vulnerability will be "code_vulnerability_label".
     """
 
-    id = "code_vulnerability"
+    id = "azureai://built-in/evaluators/code_vulnerability"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]
 
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
+        **kwargs,
     ):
+        # Set default for evaluate_query if not provided
+        if "evaluate_query" not in kwargs:
+            kwargs["evaluate_query"] = True
+
         super().__init__(
             eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            **kwargs,
         )
 
     @overload
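
Behaviour note: the constructor now forwards kwargs and, as the kwarg name suggests, opts into query evaluation unless the caller overrides it. A hedged usage sketch; the credential and project endpoint are placeholders:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import CodeVulnerabilityEvaluator

    project = "https://<account>.services.ai.azure.com/api/projects/<project>"  # placeholder

    # evaluate_query now defaults to True.
    evaluator = CodeVulnerabilityEvaluator(credential=DefaultAzureCredential(), azure_ai_project=project)

    # Explicit opt-out keeps the query out of what gets scored.
    response_only = CodeVulnerabilityEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=project,
        evaluate_query=False,
    )
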

azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -62,11 +62,11 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
-    id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+    id = "azureai://built-in/evaluators/coherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -76,6 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
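
Same pattern as the other prompty-based evaluators in this release: an optional credential is now accepted and forwarded to the base class. A hedged sketch; endpoint and deployment values are placeholders:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",
    }

    # Keyless auth: pass a credential instead of an api_key in model_config.
    coherence = CoherenceEvaluator(model_config, threshold=3, credential=DefaultAzureCredential())
    print(coherence(query="What is the capital of France?", response="Paris is the capital of France."))
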