azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -5,11 +5,13 @@ import inspect
5
5
  import contextlib
6
6
  import json
7
7
  import logging
8
+ import math
8
9
  import os
9
10
  import re
10
11
  import tempfile
11
12
  import json
12
- from typing import Any, Callable, Dict, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
13
+ import time
14
+ from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
13
15
 
14
16
  from openai import OpenAI, AzureOpenAI
15
17
  from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
@@ -18,8 +20,8 @@ import pandas as pd
18
20
 
19
21
  from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
20
22
  from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
23
+ from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
21
24
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
22
-
23
25
  from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
24
26
 
25
27
  from .._constants import (
@@ -31,8 +33,10 @@ from .._constants import (
31
33
  _InternalEvaluationMetrics,
32
34
  BINARY_AGGREGATE_SUFFIX,
33
35
  DEFAULT_OAI_EVAL_RUN_NAME,
36
+ EVALUATION_EVENT_NAME,
37
+ _EvaluatorMetricMapping,
34
38
  )
35
- from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
39
+ from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig
36
40
  from .._user_agent import UserAgentSingleton
37
41
  from ._batch_run import (
38
42
  EvalRunContext,
@@ -282,6 +286,51 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
282
286
  return results
283
287
 
284
288
 
289
+ def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
290
+ """Identify token count columns from known SDK metrics that should be excluded from aggregation.
291
+
292
+ Token counts from custom evaluators are not excluded, only those from EvaluationMetrics
293
+ and _InternalEvaluationMetrics.
294
+
295
+ :param df: The dataframe of evaluation results.
296
+ :type df: ~pandas.DataFrame
297
+ :return: List of column names to exclude from aggregation.
298
+ :rtype: List[str]
299
+ """
300
+ # Get all metric values from EvaluationMetrics class
301
+ evaluation_metrics_values = [
302
+ getattr(EvaluationMetrics, attr)
303
+ for attr in dir(EvaluationMetrics)
304
+ if not attr.startswith("_") and isinstance(getattr(EvaluationMetrics, attr), str)
305
+ ]
306
+
307
+ # Get all metric values from _InternalEvaluationMetrics class
308
+ internal_metrics_values = [
309
+ getattr(_InternalEvaluationMetrics, attr)
310
+ for attr in dir(_InternalEvaluationMetrics)
311
+ if not attr.startswith("_") and isinstance(getattr(_InternalEvaluationMetrics, attr), str)
312
+ ]
313
+
314
+ # Combine all known metrics
315
+ all_known_metrics = evaluation_metrics_values + internal_metrics_values
316
+
317
+ # Find token count columns that belong to known metrics
318
+ token_count_cols = [
319
+ col
320
+ for col in df.columns
321
+ if (
322
+ any(
323
+ col.endswith(f"{metric}_prompt_tokens")
324
+ or col.endswith(f"{metric}_completion_tokens")
325
+ or col.endswith(f"{metric}_total_tokens")
326
+ for metric in all_known_metrics
327
+ )
328
+ )
329
+ ]
330
+
331
+ return token_count_cols
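A minimal, self-contained sketch of the suffix matching used above (toy metric list; `groundedness` stands in for a value taken from `EvaluationMetrics`, which this snippet does not import):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "outputs.groundedness.groundedness": [4, 5],
        "outputs.groundedness.groundedness_prompt_tokens": [812, 790],    # excluded from aggregation
        "outputs.groundedness.groundedness_completion_tokens": [41, 38],  # excluded from aggregation
        "outputs.my_custom.my_metric_prompt_tokens": [100, 120],          # kept: not a known SDK metric
    }
)

known_metrics = ["groundedness"]  # assumption: stands in for EvaluationMetrics/_InternalEvaluationMetrics values
excluded = [
    col
    for col in df.columns
    if any(
        col.endswith(f"{m}_prompt_tokens")
        or col.endswith(f"{m}_completion_tokens")
        or col.endswith(f"{m}_total_tokens")
        for m in known_metrics
    )
]
print(excluded)
# ['outputs.groundedness.groundedness_prompt_tokens', 'outputs.groundedness.groundedness_completion_tokens']
```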
332
+
333
+
285
334
  def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
286
335
  """Aggregate metrics from the evaluation results.
287
336
  On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -314,9 +363,16 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
314
363
  handled_columns.extend(label_cols)
315
364
  defect_rates.update(label_defect_rates)
316
365
 
366
+ # Exclude token count columns from aggregation for known SDK metrics
367
+ token_count_cols = _get_token_count_columns_to_exclude(df)
368
+ handled_columns.extend(token_count_cols)
369
+
317
370
  # For rest of metrics, we will calculate mean
318
371
  df.drop(columns=handled_columns, inplace=True)
319
372
 
373
+ # Convert "not applicable" strings to None to allow proper numeric aggregation
374
+ df = df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
375
+
320
376
  # NOTE: nan/None values don't count as booleans, so boolean columns with
321
377
  # nan/None values won't have a mean produced from them.
322
378
  # This is different from label-based known evaluators, which have special handling.
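A small sketch of the two new aggregation steps, assuming `EvaluatorBase._NOT_APPLICABLE_RESULT` is the literal string `"not applicable"` (that constant lives in the SDK, not in this snippet):

```python
import pandas as pd

df = pd.DataFrame({"outputs.relevance.relevance": [4.0, "not applicable", 5.0]})

# Token-count columns identified above are appended to handled_columns and dropped before this point.
df = df.replace("not applicable", None)  # mirrors df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
print(df["outputs.relevance.relevance"].tolist())
# The string entry is now missing (None/NaN), so the mean-based aggregation that follows
# skips it instead of choking on a non-numeric value.
```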
@@ -789,7 +845,7 @@ def evaluate(
789
845
  try:
790
846
  user_agent: Optional[str] = kwargs.get("user_agent")
791
847
  with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
792
- return _evaluate(
848
+ results = _evaluate(
793
849
  evaluation_name=evaluation_name,
794
850
  target=target,
795
851
  data=data,
@@ -801,6 +857,7 @@ def evaluate(
801
857
  tags=tags,
802
858
  **kwargs,
803
859
  )
860
+ return results
804
861
  except Exception as e:
805
862
  # Handle multiprocess bootstrap error
806
863
  bootstrap_error = (
@@ -896,6 +953,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
896
953
  results_df = pd.DataFrame()
897
954
  metrics: Dict[str, float] = {}
898
955
  eval_run_info_list: List[OAIEvalRunCreationInfo] = []
956
+ eval_run_summary_dict = {}
899
957
 
900
958
  # Start OAI eval runs if any graders are present.
901
959
  need_oai_run = len(graders) > 0
@@ -930,6 +988,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
930
988
  got_local_results = True
931
989
  # TODO figure out how to update this printing to include OAI results?
932
990
  _print_summary(per_evaluator_results)
991
+ eval_run_summary_dict = {name: result["run_summary"] for name, result in per_evaluator_results.items()}
992
+ LOGGER.info(f"run_summary: \r\n{json.dumps(eval_run_summary_dict, indent=4)}")
933
993
  except EvaluationException as e:
934
994
  if need_get_oai_results:
935
995
  # If there are OAI graders, we only print a warning on local failures.
@@ -977,13 +1037,322 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
977
1037
 
978
1038
  result_df_dict = results_df.to_dict("records")
979
1039
  result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
1040
+ # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
1041
+
1042
+ eval_id: Optional[str] = kwargs.get("_eval_id")
1043
+ eval_run_id: Optional[str] = kwargs.get("_eval_run_id")
1044
+ eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("_eval_meta_data")
1045
+ if kwargs.get("_convert_to_aoai_evaluation_result", False):
1046
+ _convert_results_to_aoai_evaluation_results(
1047
+ result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data
1048
+ )
1049
+ if app_insights_configuration := kwargs.get("_app_insights_configuration"):
1050
+ emit_eval_result_events_to_app_insights(
1051
+ app_insights_configuration, result["_evaluation_results_list"], evaluator_config
1052
+ )
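Hedged sketch of how these underscore-prefixed kwargs reach `_evaluate` (they are read via `kwargs.get(...)` above and look internal/experimental; the names come from this diff, not from documented public API):

```python
from azure.ai.evaluation import evaluate


def exact_match(*, response: str, ground_truth: str, **kwargs):
    """Toy local evaluator, only here to keep the example self-contained."""
    return {"exact_match": float(response == ground_truth)}


result = evaluate(
    data="eval_data.jsonl",  # assumption: a local JSONL dataset with response/ground_truth columns
    evaluators={"exact_match": exact_match},
    _convert_to_aoai_evaluation_result=True,  # populates result["_evaluation_results_list"] / ["_evaluation_summary"]
    _app_insights_configuration={
        # AppInsightsConfig; only connection_string is read unconditionally by the emitter below
        "connection_string": "InstrumentationKey=00000000-0000-0000-0000-000000000000",
    },
)
```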
980
1053
 
981
1054
  if output_path:
982
1055
  _write_output(output_path, result)
983
-
984
1056
  return result
985
1057
 
986
1058
 
1059
+ def _build_internal_log_attributes(
1060
+ event_data: Dict[str, Any],
1061
+ metric_name: str,
1062
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]],
1063
+ internal_log_attributes: Dict[str, str],
1064
+ ) -> Dict[str, str]:
1065
+ """
1066
+ Build internal log attributes for OpenTelemetry logging.
1067
+
1068
+ :param event_data: The event data containing threshold and name information
1069
+ :type event_data: Dict[str, Any]
1070
+ :param metric_name: The name of the metric being evaluated
1071
+ :type metric_name: str
1072
+ :param evaluator_config: Configuration for evaluators
1073
+ :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
1074
+ :return: Dictionary of internal log attributes
1075
+ :rtype: Dict[str, str]
1076
+ """
1077
+ # Add threshold if present
1078
+ if event_data.get("threshold"):
1079
+ internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
1080
+
1081
+ # Add testing criteria details if present
1082
+ testing_criteria_name = event_data.get("name")
1083
+ if testing_criteria_name:
1084
+ internal_log_attributes["gen_ai.evaluation.testing_criteria.name"] = testing_criteria_name
1085
+
1086
+ # Get evaluator definition details
1087
+ if evaluator_config and testing_criteria_name in evaluator_config:
1088
+ testing_criteria_config = evaluator_config[testing_criteria_name]
1089
+
1090
+ if evaluator_name := testing_criteria_config.get("_evaluator_name"):
1091
+ internal_log_attributes["gen_ai.evaluator.name"] = str(evaluator_name)
1092
+
1093
+ if evaluator_version := testing_criteria_config.get("_evaluator_version"):
1094
+ internal_log_attributes["gen_ai.evaluator.version"] = str(evaluator_version)
1095
+
1096
+ if evaluator_id := testing_criteria_config.get("_evaluator_id"):
1097
+ internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
1098
+
1099
+ if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
1100
+ metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
1101
+
1102
+ if metric_config_detail:
1103
+ if metric_config_detail.get("min_value") is not None:
1104
+ internal_log_attributes["gen_ai.evaluation.min_value"] = str(metric_config_detail["min_value"])
1105
+ if metric_config_detail.get("max_value") is not None:
1106
+ internal_log_attributes["gen_ai.evaluation.max_value"] = str(metric_config_detail["max_value"])
1107
+
1108
+ return internal_log_attributes
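Illustrative call of this private helper with hand-built inputs (the `event_data`/`evaluator_config` shapes are inferred from how the helper is used below, not from a documented schema):

```python
attrs = _build_internal_log_attributes(
    event_data={"name": "relevance", "metric": "relevance", "threshold": 3},
    metric_name="relevance",
    evaluator_config={
        "relevance": {
            "_evaluator_name": "builtin.relevance",
            "_evaluator_version": "1",
            "_evaluator_definition": {"metrics": {"relevance": {"min_value": 1, "max_value": 5}}},
        }
    },
    internal_log_attributes={},
)
# attrs == {
#     "gen_ai.evaluation.threshold": "3",
#     "gen_ai.evaluation.testing_criteria.name": "relevance",
#     "gen_ai.evaluator.name": "builtin.relevance",
#     "gen_ai.evaluator.version": "1",
#     "gen_ai.evaluation.min_value": "1",
#     "gen_ai.evaluation.max_value": "5",
# }
```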
1109
+
1110
+
1111
+ def _log_events_to_app_insights(
1112
+ otel_logger,
1113
+ events: List[Dict[str, Any]],
1114
+ log_attributes: Dict[str, Any],
1115
+ app_insights_config: AppInsightsConfig,
1116
+ data_source_item: Optional[Dict[str, Any]] = None,
1117
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
1118
+ ) -> None:
1119
+ """
1120
+ Log independent events directly to App Insights using OpenTelemetry logging.
1121
+ No spans are created - events are sent as pure log records.
1122
+
1123
+ :param otel_logger: OpenTelemetry logger instance
1124
+ :type otel_logger: Logger
1125
+ :param events: List of event data dictionaries to log
1126
+ :type events: List[Dict[str, Any]]
1127
+ :param log_attributes: Attributes dict to use for each event (already includes extra_attributes if present)
1128
+ :type log_attributes: Dict[str, Any]
1129
+ :param app_insights_config: App Insights configuration containing connection string
1130
+ :type app_insights_config: AppInsightsConfig
1131
+ :param data_source_item: Data source item containing trace, response, and agent information
1132
+ :type data_source_item: Optional[Dict[str, Any]]
1133
+ """
1134
+
1135
+ from opentelemetry import trace
1136
+ from opentelemetry.trace import SpanContext, TraceFlags, NonRecordingSpan
1137
+
1138
+ try:
1139
+ # Initialize values from AppInsights config as defaults
1140
+ trace_id = None
1141
+ span_id = None
1142
+ response_id = None
1143
+ conversation_id = None
1144
+ previous_response_id = None
1145
+ agent_id = app_insights_config.get("agent_id", None)
1146
+ agent_version = app_insights_config.get("agent_version", None)
1147
+ agent_name = app_insights_config.get("agent_name", None)
1148
+
1149
+ # Data source item values have higher priority and will override AppInsights config defaults
1150
+ if data_source_item:
1151
+ for key, value in data_source_item.items():
1152
+ if key.endswith("trace_id") and value and isinstance(value, str):
1153
+ # Remove dashes if present
1154
+ trace_id_str = str(value).replace("-", "").lower()
1155
+ if len(trace_id_str) == 32: # Valid trace_id length
1156
+ trace_id = int(trace_id_str, 16)
1157
+ elif key == "previous_response_id" and value and isinstance(value, str):
1158
+ previous_response_id = value
1159
+ elif key == "response_id" and value and isinstance(value, str):
1160
+ response_id = value
1161
+ elif key == "conversation_id" and value and isinstance(value, str):
1162
+ conversation_id = value
1163
+ elif key == "agent_id" and value and isinstance(value, str):
1164
+ agent_id = value
1165
+ elif key.endswith("span_id") and value and isinstance(value, str):
1166
+ # Remove dashes if present and convert to int
1167
+ span_id_str = str(value).replace("-", "").lower()
1168
+ if len(span_id_str) == 16: # Valid span_id length (64-bit = 16 hex chars)
1169
+ span_id = int(span_id_str, 16)
1170
+ elif key == "agent_version" and value and isinstance(value, str):
1171
+ agent_version = value
1172
+ elif key == "agent_name" and value and isinstance(value, str):
1173
+ agent_name = value
1174
+
1175
+ # Log each event as a separate log record
1176
+ for i, event_data in enumerate(events):
1177
+ try:
1178
+ # Prepare log record attributes with specific mappings
1179
+ # The standard attributes are already in https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-events.md#event-eventgen_aievaluationresult
1180
+ metric_name = event_data.get("metric")
1181
+ standard_log_attributes = {}
1182
+ standard_log_attributes["microsoft.custom_event.name"] = EVALUATION_EVENT_NAME
1183
+ standard_log_attributes["gen_ai.evaluation.name"] = metric_name
1184
+ if event_data.get("score") is not None:
1185
+ standard_log_attributes["gen_ai.evaluation.score.value"] = event_data.get("score")
1186
+ if event_data.get("label") is not None:
1187
+ standard_log_attributes["gen_ai.evaluation.score.label"] = event_data.get("label")
1188
+
1189
+ # Internal proposed attributes
1190
+ # Put it in internal property bag for now, will be expanded if we got sign-off to Otel standard later.
1191
+ internal_log_attributes = _build_internal_log_attributes(
1192
+ event_data, metric_name, evaluator_config, log_attributes
1193
+ )
1194
+
1195
+ # Optional field that may not always be present
1196
+ if "reason" in event_data:
1197
+ standard_log_attributes["gen_ai.evaluation.explanation"] = str(event_data["reason"])
1198
+
1199
+ # Handle error from sample if present
1200
+ # Put the error message in error.type to follow OTel semantic conventions
1201
+ error = event_data.get("sample", {}).get("error", {}).get("message", None)
1202
+ if error:
1203
+ standard_log_attributes["error.type"] = error
1204
+
1205
+ # Handle redteam attack properties if present
1206
+ if "properties" in event_data:
1207
+ properties = event_data["properties"]
1208
+
1209
+ if "attack_success" in properties:
1210
+ internal_log_attributes["gen_ai.redteam.attack.success"] = str(properties["attack_success"])
1211
+
1212
+ if "attack_technique" in properties:
1213
+ internal_log_attributes["gen_ai.redteam.attack.technique"] = str(properties["attack_technique"])
1214
+
1215
+ if "attack_complexity" in properties:
1216
+ internal_log_attributes["gen_ai.redteam.attack.complexity"] = str(
1217
+ properties["attack_complexity"]
1218
+ )
1219
+
1220
+ if "attack_success_threshold" in properties:
1221
+ internal_log_attributes["gen_ai.redteam.attack.success_threshold"] = str(
1222
+ properties["attack_success_threshold"]
1223
+ )
1224
+
1225
+ # Add data source item attributes if present
1226
+ if response_id:
1227
+ standard_log_attributes["gen_ai.response.id"] = response_id
1228
+ if conversation_id:
1229
+ standard_log_attributes["gen_ai.conversation.id"] = conversation_id
1230
+ if previous_response_id:
1231
+ internal_log_attributes["gen_ai.previous.response.id"] = previous_response_id
1232
+ if agent_id:
1233
+ standard_log_attributes["gen_ai.agent.id"] = agent_id
1234
+ if agent_name:
1235
+ standard_log_attributes["gen_ai.agent.name"] = agent_name
1236
+ if agent_version:
1237
+ internal_log_attributes["gen_ai.agent.version"] = agent_version
1238
+
1239
+ # Combine standard and internal attributes, put internal under the properties bag
1240
+ standard_log_attributes["internal_properties"] = json.dumps(internal_log_attributes)
1241
+ # Anonymize IP address to prevent Azure GeoIP enrichment and location tracking
1242
+ standard_log_attributes["http.client_ip"] = "0.0.0.0"
1243
+
1244
+ # Create context with trace_id and span_id if present (for distributed tracing correlation)
1245
+ ctx = None
1246
+ if trace_id:
1247
+ span_context = SpanContext(
1248
+ trace_id=trace_id,
1249
+ span_id=span_id if span_id else 0, # Use extracted span_id or 0 if not available
1250
+ is_remote=False,
1251
+ trace_flags=TraceFlags(0x01),
1252
+ )
1253
+ span = NonRecordingSpan(span_context)
1254
+ ctx = trace.set_span_in_context(span)
1255
+
1256
+ otel_logger.emit(
1257
+ timestamp=time.time_ns(),
1258
+ observed_timestamp=time.time_ns(),
1259
+ body=EVALUATION_EVENT_NAME,
1260
+ attributes=standard_log_attributes,
1261
+ context=ctx,
1262
+ )
1263
+
1264
+ except Exception as e:
1265
+ LOGGER.warning(f"Failed to log event {i}: {e}")
1266
+
1267
+ except Exception as e:
1268
+ LOGGER.error(f"Failed to log events to App Insights: {e}")
1269
+
1270
+
1271
+ def emit_eval_result_events_to_app_insights(
1272
+ app_insights_config: AppInsightsConfig,
1273
+ results: List[Dict],
1274
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
1275
+ ) -> None:
1276
+ """
1277
+ Emit evaluation result events to App Insights using OpenTelemetry logging.
1278
+ Each result is logged as an independent log record, potentially including trace context.
1279
+
1280
+ :param app_insights_config: App Insights configuration containing connection string
1281
+ :type app_insights_config: AppInsightsConfig
1282
+ :param results: List of evaluation results to log
1283
+ :type results: List[Dict]
1284
+ """
1285
+
1286
+ from opentelemetry import _logs
1287
+ from opentelemetry.sdk._logs import LoggerProvider
1288
+ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
1289
+ from opentelemetry.sdk.resources import Resource
1290
+ from opentelemetry.semconv.resource import ResourceAttributes
1291
+ from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter
1292
+
1293
+ if not results:
1294
+ LOGGER.debug("No results to log to App Insights")
1295
+ return
1296
+
1297
+ try:
1298
+ # Configure OpenTelemetry logging with anonymized Resource attributes
1299
+
1300
+ # Create a resource with minimal attributes to prevent sensitive data collection
1301
+ # SERVICE_INSTANCE_ID maps to cloud_RoleInstance in Azure Monitor and prevents
1302
+ # Azure Monitor from auto-detecting the device hostname
1303
+ anonymized_resource = Resource.create(
1304
+ {
1305
+ ResourceAttributes.SERVICE_NAME: "unknown",
1306
+ ResourceAttributes.SERVICE_INSTANCE_ID: "unknown",
1307
+ }
1308
+ )
1309
+
1310
+ logger_provider = LoggerProvider(resource=anonymized_resource)
1311
+ _logs.set_logger_provider(logger_provider)
1312
+
1313
+ # Create Azure Monitor log exporter
1314
+ azure_log_exporter = AzureMonitorLogExporter(connection_string=app_insights_config["connection_string"])
1315
+
1316
+ # Add the Azure Monitor exporter to the logger provider
1317
+ logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
1318
+
1319
+ # Create a logger from OUR configured logger_provider (not the global one)
1320
+ # This ensures the logger uses our anonymized resource
1321
+ otel_logger = logger_provider.get_logger(__name__)
1322
+
1323
+ # Initialize base log attributes with extra_attributes if present, otherwise empty dict
1324
+ base_log_attributes = app_insights_config.get("extra_attributes", {})
1325
+
1326
+ # Add AppInsights config attributes with proper semantic convention mappings
1327
+ if "run_type" in app_insights_config:
1328
+ base_log_attributes["gen_ai.evaluation.azure_ai_type"] = str(app_insights_config["run_type"])
1329
+ if "schedule_type" in app_insights_config:
1330
+ base_log_attributes["gen_ai.evaluation.azure_ai_scheduled"] = str(app_insights_config["schedule_type"])
1331
+ if "run_id" in app_insights_config:
1332
+ base_log_attributes["gen_ai.evaluation.run.id"] = str(app_insights_config["run_id"])
1333
+ if "project_id" in app_insights_config:
1334
+ base_log_attributes["gen_ai.azure_ai_project.id"] = str(app_insights_config["project_id"])
1335
+
1336
+ for result in results:
1337
+ # Create a copy of base attributes for this result's events
1338
+ log_attributes = base_log_attributes.copy()
1339
+
1340
+ _log_events_to_app_insights(
1341
+ otel_logger=otel_logger,
1342
+ events=result["results"],
1343
+ log_attributes=log_attributes,
1344
+ data_source_item=result["datasource_item"] if "datasource_item" in result else None,
1345
+ evaluator_config=evaluator_config,
1346
+ app_insights_config=app_insights_config,
1347
+ )
1348
+ # Force flush to ensure events are sent
1349
+ logger_provider.force_flush()
1350
+ LOGGER.info(f"Successfully logged {len(results)} evaluation results to App Insights")
1351
+
1352
+ except Exception as e:
1353
+ LOGGER.error(f"Failed to emit evaluation results to App Insights: {e}")
1354
+
1355
+
987
1356
  def _preprocess_data(
988
1357
  data: Union[str, os.PathLike],
989
1358
  evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
@@ -1066,7 +1435,7 @@ def _preprocess_data(
1066
1435
  batch_run_data = input_data_df
1067
1436
  elif client_type == "pf_client":
1068
1437
  batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
1069
- # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
1438
  # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
1070
1439
  # multiple evaluators. If the path is already absolute, abspath will return the original path.
1071
1440
  batch_run_data = os.path.abspath(data)
1072
1441
  elif client_type == "code_client":
@@ -1131,11 +1500,36 @@ def _preprocess_data(
1131
1500
  # via target mapping.
1132
1501
  # If both the data and the output dictionary of the target function
1133
1502
  # have the same column, then the target function value is used.
1503
+ # NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
1504
+ # Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
1134
1505
  if input_data_df is not None:
1506
+ if "conversation" in input_data_df.columns or "messages" in input_data_df.columns:
1507
+ # No action is taken when 'conversation' or 'messages' columns are present,
1508
+ # as these indicate chat/conversation data which should not be flattened or mapped by default.
1509
+ pass
1510
+ else:
1511
+ input_data_df = _flatten_object_columns_for_default_mapping(input_data_df)
1512
+
1513
+ # Build default mapping for leaves:
1514
+ if input_data_df is not None:
1515
+ # First, map flattened nested columns (those containing a dot) to leaf names.
1516
+ for col in input_data_df.columns:
1517
+ # Skip target output columns
1518
+ if col.startswith(Prefixes.TSG_OUTPUTS):
1519
+ continue
1520
+ # Skip root container columns (no dot) here; they'll be handled below if truly primitive.
1521
+ if "." in col:
1522
+ leaf_name = col.split(".")[-1]
1523
+ if leaf_name not in column_mapping["default"]:
1524
+ column_mapping["default"][leaf_name] = f"${{data.{col}}}"
1525
+
1526
+ # Then, handle remaining top-level primitive columns (original logic).
1135
1527
  for col in input_data_df.columns:
1136
- # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
1137
- # Also ignore columns that are already in config, since they've been covered by target mapping.
1138
- if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
1528
+ if (
1529
+ not col.startswith(Prefixes.TSG_OUTPUTS)
1530
+ and col not in column_mapping["default"].keys()
1531
+ and "." not in col # only pure top-level primitives
1532
+ ):
1139
1533
  column_mapping["default"][col] = f"${{data.{col}}}"
1140
1534
 
1141
1535
  return __ValidatedData(
@@ -1149,6 +1543,79 @@ def _preprocess_data(
1149
1543
  )
1150
1544
 
1151
1545
 
1546
+ def _flatten_object_columns_for_default_mapping(
1547
+ df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
1548
+ ) -> pd.DataFrame:
1549
+ """Flatten nested dictionary-valued columns into dotted leaf columns.
1550
+
1551
+ For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
1552
+ leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
1553
+ columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
1554
+ all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
1555
+ are all treated as leaves). Existing columns are never overwritten (idempotent behavior).
1556
+
1557
+ Example
1558
+ If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
1559
+ columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.
1560
+
1561
+ :param df: Input DataFrame to flatten in place.
1562
+ :type df: ~pandas.DataFrame
1563
+ :param root_prefixes: Optional iterable restricting which top-level columns are considered
1564
+ for flattening. If ``None``, all columns containing at least one ``dict`` value are processed.
1565
+ :type root_prefixes: Optional[Iterable[str]]
1566
+ :return: The same DataFrame instance (returned for convenient chaining).
1567
+ :rtype: ~pandas.DataFrame
1568
+ """
1569
+ candidate_cols = []
1570
+ if root_prefixes is not None:
1571
+ candidate_cols = [c for c in root_prefixes if c in df.columns]
1572
+ else:
1573
+ # pick columns where at least one non-null value is a dict
1574
+ for c in df.columns:
1575
+ series = df[c]
1576
+ if series.map(lambda v: isinstance(v, dict)).any():
1577
+ candidate_cols.append(c)
1578
+
1579
+ def _extract_leaves(obj: Any, prefix: str) -> Iterator[Tuple[str, Any]]:
1580
+ if isinstance(obj, dict):
1581
+ for k, v in obj.items():
1582
+ new_prefix = f"{prefix}.{k}" if prefix else k
1583
+ if isinstance(v, dict):
1584
+ yield from _extract_leaves(v, new_prefix)
1585
+ else:
1586
+ # treat list / primitive / None as leaf
1587
+ yield new_prefix, v
1588
+
1589
+ for root_col in candidate_cols:
1590
+ # Build a union of leaf paths across rows to ensure consistent columns
1591
+ leaf_paths: Set[str] = set()
1592
+ for val in df[root_col]:
1593
+ if isinstance(val, dict):
1594
+ for path, _ in _extract_leaves(val, root_col):
1595
+ leaf_paths.add(path)
1596
+
1597
+ if not leaf_paths:
1598
+ continue
1599
+
1600
+ # Create each flattened column if absent
1601
+ for path in leaf_paths:
1602
+ if path in df.columns:
1603
+ continue # already present
1604
+ relative_keys = path[len(root_col) + 1 :].split(".") if len(path) > len(root_col) else []
1605
+
1606
+ def getter(root_val: Any) -> Any:
1607
+ cur = root_val
1608
+ for rk in relative_keys:
1609
+ if not isinstance(cur, dict):
1610
+ return None
1611
+ cur = cur.get(rk, None)
1612
+ return cur
1613
+
1614
+ df[path] = df[root_col].map(lambda rv: getter(rv) if isinstance(rv, dict) else None)
1615
+
1616
+ return df
1617
+
1618
+
1152
1619
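Usage sketch for the flattening helper, with a toy nested `item` column (the resulting dotted columns are what the default-mapping loop above turns into `${data.<col>}` references keyed by their leaf name):

```python
import pandas as pd

df = pd.DataFrame(
    {"item": [{"query": "q1", "meta": {"lang": "en"}}, {"query": "q2", "meta": {"lang": "fr"}}]}
)
df = _flatten_object_columns_for_default_mapping(df)
print(sorted(c for c in df.columns if "." in c))
# ['item.meta.lang', 'item.query']
# e.g. the "lang" leaf would then be mapped as "${data.item.meta.lang}" in column_mapping["default"].
```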
  def _run_callable_evaluators(
1153
1620
  validated_data: __ValidatedData,
1154
1621
  fail_on_evaluator_errors: bool = False,
@@ -1304,3 +1771,652 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
1304
1771
  category=ErrorCategory.FAILED_EXECUTION,
1305
1772
  blame=ErrorBlame.UNKNOWN,
1306
1773
  )
1774
+
1775
+
1776
+ def _convert_results_to_aoai_evaluation_results(
1777
+ results: EvaluationResult,
1778
+ logger: logging.Logger,
1779
+ eval_id: Optional[str] = None,
1780
+ eval_run_id: Optional[str] = None,
1781
+ evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] = None,
1782
+ eval_run_summary: Optional[Dict[str, Any]] = None,
1783
+ eval_meta_data: Optional[Dict[str, Any]] = None,
1784
+ ) -> None:
1785
+ """
1786
+ Convert evaluation results to AOAI evaluation results format.
1787
+
1788
+ Each row of input results.rows looks like:
1789
+ {"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe",
1790
+ "inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.",
1791
+ "outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5}
1792
+
1793
+ Convert each row into new RunOutputItem object with results array.
1794
+
1795
+ :param results: The evaluation results to convert
1796
+ :type results: EvaluationResult
1797
+ :param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria
1798
+ :type eval_meta_data: Dict[str, Any]
1799
+ :param logger: Logger instance
1800
+ :type logger: logging.Logger
1801
+ :return: None. The converted rows are attached to ``results`` in place as ``_evaluation_results_list`` and ``_evaluation_summary``.
1802
+ :rtype: None
1803
+ """
1804
+
1805
+ if evaluators is None:
1806
+ return
1807
+
1808
+ # Get the testing_criteria_name and testing_criteria_type from evaluators
1809
+ testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {}
1810
+ criteria_name_types_from_meta: Optional[Dict[str, str]] = {}
1811
+ if eval_meta_data and "testing_criteria" in eval_meta_data:
1812
+ testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")
1813
+ if testing_criteria_list is not None:
1814
+ for criteria in testing_criteria_list:
1815
+ criteria_name = criteria.get("name")
1816
+ criteria_type = criteria.get("type")
1817
+ if criteria_name is not None and criteria_type is not None:
1818
+ criteria_name_types_from_meta[criteria_name] = criteria
1819
+
1820
+ for criteria_name, evaluator in evaluators.items():
1821
+ criteria_type = None
1822
+ metrics = []
1823
+ if criteria_name in criteria_name_types_from_meta:
1824
+ criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
1825
+ evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
1826
+ current_evaluator_metrics = criteria_name_types_from_meta[criteria_name].get("metrics", None)
1827
+ if current_evaluator_metrics and len(current_evaluator_metrics) > 0:
1828
+ metrics.extend(current_evaluator_metrics)
1829
+ elif evaluator_name:
1830
+ if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
1831
+ evaluator_name = evaluator_name.replace("builtin.", "")
1832
+ metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
1833
+ if metrics_mapped and len(metrics_mapped) > 0:
1834
+ metrics.extend(metrics_mapped)
1835
+ else:
1836
+ metrics.append(criteria_name)
1837
+ elif isinstance(evaluator, AzureOpenAIGrader):
1838
+ criteria_type = evaluator._type # pylint: disable=protected-access
1839
+ metrics.append(criteria_name)
1840
+ elif isinstance(evaluator, EvaluatorBase):
1841
+ criteria_type = "azure_ai_evaluator"
1842
+ evaluator_class_name = evaluator.__class__.__name__
1843
+ eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None)
1844
+ if eval_name:
1845
+ metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
1846
+ if metrics_mapped and len(metrics_mapped) > 0:
1847
+ metrics.extend(metrics_mapped)
1848
+ else:
1849
+ metrics.append(criteria_name)
1850
+ else:
1851
+ criteria_type = "unknown"
1852
+ metrics.append(criteria_name)
1853
+ testing_criteria_name_types_metrics[criteria_name] = {"type": criteria_type, "metrics": metrics}
1854
+
1855
+ created_time = int(time.time())
1856
+ converted_rows = []
1857
+
1858
+ for row_idx, row in enumerate(results.get("rows", [])):
1859
+ # Group outputs by test criteria name
1860
+ criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()}
1861
+ input_groups = {}
1862
+ top_sample = {}
1863
+ for key, value in row.items():
1864
+ if key.startswith("outputs."):
1865
+ # Parse key: outputs.<test-criteria-name>.<metric>
1866
+ parts = key.split(".", 2) # Split into max 3 parts: ['outputs', '<criteria-name>', '<metric>']
1867
+ if len(parts) >= 3:
1868
+ criteria_name = parts[1]
1869
+ metric_name = parts[2]
1870
+
1871
+ if criteria_name not in criteria_groups:
1872
+ criteria_groups[criteria_name] = {}
1873
+
1874
+ criteria_groups[criteria_name][metric_name] = value
1875
+ elif key.startswith("inputs."):
1876
+ input_key = key.replace("inputs.", "")
1877
+ if input_key not in input_groups:
1878
+ input_groups[input_key] = value
1879
+
1880
+ # Convert each criteria group to RunOutputItem result
1881
+ run_output_results = []
1882
+ for criteria_name, metrics in criteria_groups.items():
1883
+ # Extract metrics for this criteria
1884
+ expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
1885
+ criteria_type = testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "unknown")
1886
+ result_per_metric = {}
1887
+ # Find score - look for various score patterns
1888
+ for metric_key, metric_value in metrics.items():
1889
+ if metric_key.endswith("_score") or metric_key == "score":
1890
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1891
+ if metric not in result_per_metric:
1892
+ result_per_metric[metric] = {"score": metric_value}
1893
+ else:
1894
+ result_per_metric[metric]["score"] = metric_value
1895
+ _append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
1896
+ if metric_key == "passed":
1897
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1898
+ if metric not in result_per_metric:
1899
+ result_per_metric[metric] = {"passed": metric_value}
1900
+ else:
1901
+ result_per_metric[metric]["passed"] = metric_value
1902
+ _append_indirect_attachments_to_results(result_per_metric, "passed", metric, metric_value)
1903
+ elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
1904
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1905
+ label = metric_value
1906
+ passed = (
1907
+ True if (str(metric_value).lower() == "pass" or str(metric_value).lower() == "true") else False
1908
+ )
1909
+ if metric not in result_per_metric:
1910
+ if criteria_type == "azure_ai_evaluator":
1911
+ result_per_metric[metric] = {"label": label, "passed": passed}
1912
+ else:
1913
+ result_per_metric[metric] = {"label": label}
1914
+ else:
1915
+ result_per_metric[metric]["label"] = metric_value
1916
+ if criteria_type == "azure_ai_evaluator":
1917
+ result_per_metric[metric]["passed"] = passed
1918
+ _append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
1919
+ if criteria_type == "azure_ai_evaluator":
1920
+ _append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
1921
+ elif (
1922
+ metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
1923
+ ) or metric_key == "reason":
1924
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1925
+ if metric not in result_per_metric:
1926
+ result_per_metric[metric] = {"reason": metric_value}
1927
+ else:
1928
+ result_per_metric[metric]["reason"] = metric_value
1929
+ _append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
1930
+ elif metric_key.endswith("_threshold") or metric_key == "threshold":
1931
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1932
+ if metric not in result_per_metric:
1933
+ result_per_metric[metric] = {"threshold": metric_value}
1934
+ else:
1935
+ result_per_metric[metric]["threshold"] = metric_value
1936
+ _append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
1937
+ elif metric_key == "sample":
1938
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1939
+ if metric not in result_per_metric:
1940
+ result_per_metric[metric] = {"sample": metric_value}
1941
+ else:
1942
+ result_per_metric[metric]["sample"] = metric_value
1943
+ _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
1944
+ elif metric_key.endswith("_finish_reason"):
1945
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1946
+ if metric not in result_per_metric:
1947
+ result_per_metric[metric] = {"sample": {"finish_reason": metric_value}}
1948
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1949
+ result_per_metric[metric]["sample"] = {"finish_reason": metric_value}
1950
+ elif (
1951
+ metric in result_per_metric
1952
+ and "sample" in result_per_metric[metric]
1953
+ and "finish_reason" not in result_per_metric[metric]["sample"]
1954
+ ):
1955
+ result_per_metric[metric]["sample"]["finish_reason"] = metric_value
1956
+ _append_indirect_attachments_to_results(
1957
+ result_per_metric, "sample", metric, metric_value, "finish_reason"
1958
+ )
1959
+ elif metric_key.endswith("_model"):
1960
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1961
+ if metric not in result_per_metric:
1962
+ result_per_metric[metric] = {"sample": {"model": metric_value}}
1963
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1964
+ result_per_metric[metric]["sample"] = {"model": metric_value}
1965
+ elif (
1966
+ metric in result_per_metric
1967
+ and "sample" in result_per_metric[metric]
1968
+ and "model" not in result_per_metric[metric]["sample"]
1969
+ ):
1970
+ result_per_metric[metric]["sample"]["model"] = metric_value
1971
+ _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
1972
+ elif metric_key.endswith("_sample_input"):
1973
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1974
+ input_metric_val_json: Optional[List[Dict[str, Any]]] = []
1975
+ try:
1976
+ input_metric_val_json = json.loads(metric_value)
1977
+ except Exception as e:
1978
+ logger.warning(f"Failed to parse _sample_input value as JSON: {e}")
1979
+ if metric not in result_per_metric:
1980
+ result_per_metric[metric] = {"sample": {"input": input_metric_val_json}}
1981
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1982
+ result_per_metric[metric]["sample"] = {"input": input_metric_val_json}
1983
+ elif (
1984
+ metric in result_per_metric
1985
+ and "sample" in result_per_metric[metric]
1986
+ and "input" not in result_per_metric[metric]["sample"]
1987
+ ):
1988
+ result_per_metric[metric]["sample"]["input"] = input_metric_val_json
1989
+ _append_indirect_attachments_to_results(
1990
+ result_per_metric, "sample", metric, input_metric_val_json, "input"
1991
+ )
1992
+ elif metric_key.endswith("_sample_output"):
1993
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1994
+ output_metric_val_json: Optional[List[Dict[str, Any]]] = []
1995
+ try:
1996
+ output_metric_val_json = json.loads(metric_value)
1997
+ except Exception as e:
1998
+ logger.warning(f"Failed to parse _sample_output value as JSON: {e}")
1999
+ if metric not in result_per_metric:
2000
+ result_per_metric[metric] = {"sample": {"output": output_metric_val_json}}
2001
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2002
+ result_per_metric[metric]["sample"] = {"output": output_metric_val_json}
2003
+ elif (
2004
+ metric in result_per_metric
2005
+ and "sample" in result_per_metric[metric]
2006
+ and "output" not in result_per_metric[metric]["sample"]
2007
+ ):
2008
+ result_per_metric[metric]["sample"]["output"] = output_metric_val_json
2009
+ _append_indirect_attachments_to_results(
2010
+ result_per_metric, "sample", metric, output_metric_val_json, "output"
2011
+ )
2012
+ elif metric_key.endswith("_total_tokens"):
2013
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2014
+ if metric not in result_per_metric:
2015
+ result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
2016
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2017
+ result_per_metric[metric]["sample"] = {"usage": {"total_tokens": metric_value}}
2018
+ elif (
2019
+ metric in result_per_metric
2020
+ and "sample" in result_per_metric[metric]
2021
+ and "usage" not in result_per_metric[metric]["sample"]
2022
+ ):
2023
+ result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
2024
+ else:
2025
+ result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
2026
+ _append_indirect_attachments_to_results(
2027
+ result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
2028
+ )
2029
+ elif metric_key.endswith("_prompt_tokens"):
2030
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2031
+ if metric not in result_per_metric:
2032
+ result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
2033
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2034
+ result_per_metric[metric]["sample"] = {"usage": {"prompt_tokens": metric_value}}
2035
+ elif (
2036
+ metric in result_per_metric
2037
+ and "sample" in result_per_metric[metric]
2038
+ and "usage" not in result_per_metric[metric]["sample"]
2039
+ ):
2040
+ result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
2041
+ else:
2042
+ result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
2043
+ _append_indirect_attachments_to_results(
2044
+ result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
2045
+ )
2046
+ elif metric_key.endswith("_completion_tokens"):
2047
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2048
+ if metric not in result_per_metric:
2049
+ result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
2050
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2051
+ result_per_metric[metric]["sample"] = {"usage": {"completion_tokens": metric_value}}
2052
+ elif (
2053
+ metric in result_per_metric
2054
+ and "sample" in result_per_metric[metric]
2055
+ and "usage" not in result_per_metric[metric]["sample"]
2056
+ ):
2057
+ result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
2058
+ else:
2059
+ result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
2060
+ _append_indirect_attachments_to_results(
2061
+ result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
2062
+ )
2063
+ elif not any(
2064
+ metric_key.endswith(suffix)
2065
+ for suffix in [
2066
+ "_result",
2067
+ "_reason",
2068
+ "_threshold",
2069
+ "_label",
2070
+ "_score",
2071
+ "_model",
2072
+ "_finish_reason",
2073
+ "_sample_input",
2074
+ "_sample_output",
2075
+ "_total_tokens",
2076
+ "_prompt_tokens",
2077
+ "_completion_tokens",
2078
+ ]
2079
+ ):
2080
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2081
+ # If no score found yet and this doesn't match other patterns, use as score
2082
+ if metric_key == metric and metric not in result_per_metric:
2083
+ result_per_metric[metric] = {"score": metric_value}
2084
+ elif metric_key == metric and result_per_metric[metric].get("score", None) is None:
2085
+ result_per_metric[metric]["score"] = metric_value
2086
+
2087
+ for metric, metric_values in result_per_metric.items():
2088
+ score = metric_values.get("score", None)
2089
+ label = metric_values.get("label", None)
2090
+ reason = metric_values.get("reason", None)
2091
+ threshold = metric_values.get("threshold", None)
2092
+ passed = metric_values.get("passed", None)
2093
+ sample = metric_values.get("sample", None)
2094
+
2095
+ # Create result object for this criteria
2096
+ result_obj = {
2097
+ "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
2098
+ "type", "azure_ai_evaluator"
2099
+ ),
2100
+ "name": criteria_name, # Use criteria name as name
2101
+ "metric": metric if metric is not None else criteria_name, # Use criteria name as metric
2102
+ }
2103
+ # Add optional fields
2104
+ if (
2105
+ metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
2106
+ or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
2107
+ or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
2108
+ ):
2109
+ copy_label = label
2110
+ if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
2111
+ label = "fail"
2112
+ score = 0.0
2113
+ passed = False
2114
+ else:
2115
+ label = "pass"
2116
+ score = 1.0
2117
+ passed = True
2118
+ result_obj["score"] = (
2119
+ score if not (score is None or (isinstance(score, float) and math.isnan(score))) else None
2120
+ )
2121
+ result_obj["label"] = label
2122
+ result_obj["reason"] = reason
2123
+ result_obj["threshold"] = threshold
2124
+ result_obj["passed"] = passed
2125
+
2126
+ if sample is not None:
2127
+ result_obj["sample"] = sample
2128
+ top_sample = sample # Save top sample for the row
2129
+ run_output_results.append(result_obj)
2130
+
2131
+ if (
2132
+ eval_run_summary
2133
+ and criteria_name in eval_run_summary
2134
+ and isinstance(eval_run_summary[criteria_name], dict)
2135
+ and "error_code" in eval_run_summary[criteria_name]
2136
+ ) and eval_run_summary[criteria_name].get("error_code", None) is not None:
2137
+ error_info = (
2138
+ {
2139
+ "code": eval_run_summary[criteria_name].get("error_code", None),
2140
+ "message": eval_run_summary[criteria_name].get("error_message", None),
2141
+ }
2142
+ if eval_run_summary[criteria_name].get("error_code", None) is not None
2143
+ else None
2144
+ )
2145
+ sample = {"error": error_info} if error_info is not None else None
2146
+ # Create result object for this criteria
2147
+ metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
2148
+ for metric in metrics:
2149
+ result_obj = {
2150
+ "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
2151
+ "type", "azure_ai_evaluator"
2152
+ ),
2153
+ "name": criteria_name, # Use criteria name as name
2154
+ "metric": metric if metric is not None else criteria_name, # Use criteria name as metric
2155
+ "score": None,
2156
+ "label": None,
2157
+ "reason": None,
2158
+ "threshold": None,
2159
+ "passed": None,
2160
+ "sample": sample,
2161
+ }
2162
+ run_output_results.append(result_obj)
2163
+
2164
+ # Create RunOutputItem structure
2165
+ run_output_item = {
2166
+ "object": "eval.run.output_item",
2167
+ "id": f"{row_idx+1}",
2168
+ "run_id": eval_run_id,
2169
+ "eval_id": eval_id,
2170
+ "created_at": created_time,
2171
+ "datasource_item_id": row_idx,
2172
+ "datasource_item": input_groups,
2173
+ "results": run_output_results,
2174
+ "status": "completed" if len(run_output_results) > 0 else "error",
2175
+ }
2176
+
2177
+ run_output_item["sample"] = top_sample
2178
+
2179
+ converted_rows.append(run_output_item)
2180
+
2181
+ # Create converted results maintaining the same structure
2182
+ results["_evaluation_results_list"] = converted_rows
2183
+ logger.info(
2184
+ f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
2185
+ )
2186
+ # Calculate summary statistics
2187
+ evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger)
2188
+ results["_evaluation_summary"] = evaluation_summary
2189
+ logger.info(
2190
+ f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
2191
+ )
2192
+
2193
+
2194
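For orientation, an illustrative RunOutputItem produced by the conversion above for a single row scored by a `relevance` criteria (field values are placeholders; the key layout follows the code, not a documented contract):

```python
run_output_item = {
    "object": "eval.run.output_item",
    "id": "1",
    "run_id": "<eval_run_id>",
    "eval_id": "<eval_id>",
    "created_at": 1700000000,
    "datasource_item_id": 0,
    "datasource_item": {"query": "What is the capital of France?", "ground_truth": "Paris"},
    "results": [
        {
            "type": "azure_ai_evaluator",
            "name": "relevance",    # testing criteria name
            "metric": "relevance",  # resolved via _get_metric_from_criteria
            "score": 4.0,
            "label": "pass",
            "reason": "The response answers the query.",
            "threshold": 3,
            "passed": True,
            # "sample" is added when *_sample_input/_sample_output/usage columns are present
        }
    ],
    "status": "completed",
    "sample": {},
}
```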
+ def _append_indirect_attachments_to_results(
+     current_result_dict: Dict[str, Any],
+     result_name: str,
+     metric: str,
+     metric_value: Any,
+     nested_result_name: Optional[str] = None,
+     secondnested_result_name: Optional[str] = None,
+ ) -> None:
+     """
+     Fan an indirect attack ("xpia") metric value out to the extended XPIA sub-metrics in the current result dictionary.
+
+     :param current_result_dict: The current result dictionary to update
+     :type current_result_dict: Dict[str, Any]
+     :param result_name: The result name
+     :type result_name: str
+     :param metric: The metric name
+     :type metric: str
+     :param metric_value: The value of the metric
+     :type metric_value: Any
+     :param nested_result_name: Optional nested key to set under the result name
+     :type nested_result_name: Optional[str]
+     :param secondnested_result_name: Optional second-level nested key to set under the nested key
+     :type secondnested_result_name: Optional[str]
+     """
+     if metric == "xpia" and result_name:
+         for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
+             if nested_result_name is None:
+                 if metric_extended not in current_result_dict:
+                     current_result_dict[metric_extended] = {result_name: metric_value}
+                 else:
+                     current_result_dict[metric_extended][result_name] = metric_value
+             elif nested_result_name is not None and secondnested_result_name is None:
+                 if metric_extended not in current_result_dict:
+                     current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
+                 elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                     current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
+                 elif (
+                     metric_extended in current_result_dict
+                     and result_name in current_result_dict[metric_extended]
+                     and nested_result_name not in current_result_dict[metric_extended][result_name]
+                 ):
+                     current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
+             elif nested_result_name is not None and secondnested_result_name is not None:
+                 if metric_extended not in current_result_dict:
+                     current_result_dict[metric_extended] = {
+                         result_name: {nested_result_name: {secondnested_result_name: metric_value}}
+                     }
+                 elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                     current_result_dict[metric_extended][result_name] = {
+                         nested_result_name: {secondnested_result_name: metric_value}
+                     }
+                 elif (
+                     metric_extended in current_result_dict
+                     and result_name in current_result_dict[metric_extended]
+                     and nested_result_name not in current_result_dict[metric_extended][result_name]
+                 ):
+                     current_result_dict[metric_extended][result_name][nested_result_name] = {
+                         secondnested_result_name: metric_value
+                     }
+                 else:
+                     (
+                         current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
+                     ) = metric_value
+
+
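A quick usage sketch (not part of the diff) of the fan-out behaviour, assuming the helper above is in scope; the result name and value are hypothetical.

# Illustrative only: "indirect_attack" and 0.0 are hypothetical inputs.
current: dict = {}
_append_indirect_attachments_to_results(current, "indirect_attack", "xpia", 0.0)
# The single "xpia" value is copied under each extended XPIA metric:
# {"xpia_manipulated_content": {"indirect_attack": 0.0},
#  "xpia_intrusion": {"indirect_attack": 0.0},
#  "xpia_information_gathering": {"indirect_attack": 0.0}}
print(current)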
+ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+     """
+     Get the metric name from the testing criteria and metric key.
+
+     :param testing_criteria_name: The name of the testing criteria
+     :type testing_criteria_name: str
+     :param metric_key: The metric key to look for
+     :type metric_key: str
+     :param metric_list: List of expected metrics for the testing criteria
+     :type metric_list: List[str]
+     :return: The metric name if found, otherwise the testing criteria name
+     :rtype: str
+     """
+     metric = None
+
+     if metric_key == "xpia_manipulated_content":
+         metric = "xpia_manipulated_content"
+         return metric
+     elif metric_key == "xpia_intrusion":
+         metric = "xpia_intrusion"
+         return metric
+     elif metric_key == "xpia_information_gathering":
+         metric = "xpia_information_gathering"
+         return metric
+     for expected_metric in metric_list:
+         if metric_key.startswith(expected_metric):
+             metric = expected_metric
+             break
+     if metric is None:
+         metric = testing_criteria_name
+     return metric
+
+
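A short sketch (not part of the diff) of how the lookup resolves, assuming the function above is in scope; the criteria and metric names are hypothetical.

# Illustrative only: hypothetical criteria and metric names.
metric_list = ["violence", "self_harm"]
print(_get_metric_from_criteria("content_safety", "violence_score", metric_list))    # -> violence (prefix match)
print(_get_metric_from_criteria("content_safety", "xpia_intrusion", metric_list))    # -> xpia_intrusion (special-cased)
print(_get_metric_from_criteria("content_safety", "unrecognized_key", metric_list))  # -> content_safety (fallback)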
+ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger) -> Dict[str, Any]:
+     """
+     Calculate summary statistics for AOAI evaluation results.
+
+     :param aoai_results: List of AOAI result objects (run_output_items)
+     :type aoai_results: list
+     :param logger: Logger used for diagnostic output
+     :type logger: logging.Logger
+     :return: Summary statistics dictionary
+     :rtype: Dict[str, Any]
+     """
+     # Calculate result counts based on aoai_results
+     result_counts = {"total": 0, "errored": 0, "failed": 0, "passed": 0}
+
+     # Count results by status and calculate per model usage
+     model_usage_stats = {}  # Dictionary to aggregate usage by model
+     result_counts_stats = {}  # Dictionary to aggregate pass/fail counts per testing criteria
+
+     for aoai_result in aoai_results:
+         logger.info(
+             f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}"
+         )
+         result_counts["total"] += 1
+         passed_count = 0
+         failed_count = 0
+         error_count = 0
+         if isinstance(aoai_result, dict) and "results" in aoai_result:
+             logger.info(
+                 f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}"
+             )
+             for result_item in aoai_result["results"]:
+                 if isinstance(result_item, dict):
+                     # Check if the result has a 'passed' field
+                     if "passed" in result_item and result_item["passed"] is not None:
+                         testing_criteria = result_item.get("name", "")
+                         if testing_criteria not in result_counts_stats:
+                             result_counts_stats[testing_criteria] = {
+                                 "testing_criteria": testing_criteria,
+                                 "failed": 0,
+                                 "passed": 0,
+                             }
+                         if result_item["passed"] is True:
+                             passed_count += 1
+                             result_counts_stats[testing_criteria]["passed"] += 1
+
+                         elif result_item["passed"] is False:
+                             failed_count += 1
+                             result_counts_stats[testing_criteria]["failed"] += 1
+                     # Check if the result indicates an error status
+                     elif ("status" in result_item and result_item["status"] in ["error", "errored"]) or (
+                         "sample" in result_item
+                         and isinstance(result_item["sample"], dict)
+                         and result_item["sample"].get("error", None) is not None
+                     ):
+                         error_count += 1
+         elif hasattr(aoai_result, "status") and aoai_result.status == "error":
+             error_count += 1
+         elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
+             error_count += 1
+
+         if error_count > 0:
+             result_counts["errored"] += 1
+         elif failed_count > 0:
+             result_counts["failed"] += 1
+         elif (
+             error_count == 0
+             and failed_count == 0
+             and passed_count > 0
+             and passed_count == len(aoai_result.get("results", []))
+         ):
+             result_counts["passed"] += 1
+
+         # Extract usage statistics from aoai_result.sample
+         sample_data_list = []
+         dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
+         dup_usage_list.remove("xpia")
+         if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
+             for result_item in aoai_result["results"]:
+                 if (
+                     isinstance(result_item, dict)
+                     and "sample" in result_item
+                     and result_item["sample"]
+                     and result_item["metric"] not in dup_usage_list
+                 ):
+                     sample_data_list.append(result_item["sample"])
+
+         for sample_data in sample_data_list:
+             if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
+                 usage_data = sample_data["usage"]
+                 model_name = sample_data.get("model", "unknown")
+                 if model_name not in model_usage_stats:
+                     model_usage_stats[model_name] = {
+                         "invocation_count": 0,
+                         "total_tokens": 0,
+                         "prompt_tokens": 0,
+                         "completion_tokens": 0,
+                         "cached_tokens": 0,
+                     }
+                 # Aggregate usage statistics
+                 model_stats = model_usage_stats[model_name]
+                 model_stats["invocation_count"] += 1
+                 if isinstance(usage_data, dict):
+                     model_stats["total_tokens"] += usage_data.get("total_tokens", 0)
+                     model_stats["prompt_tokens"] += usage_data.get("prompt_tokens", 0)
+                     model_stats["completion_tokens"] += usage_data.get("completion_tokens", 0)
+                     model_stats["cached_tokens"] += usage_data.get("cached_tokens", 0)
+
+     # Convert model usage stats to list format matching EvaluationRunPerModelUsage
+     per_model_usage = []
+     for model_name, stats in model_usage_stats.items():
+         per_model_usage.append(
+             {
+                 "model_name": model_name,
+                 "invocation_count": stats["invocation_count"],
+                 "total_tokens": stats["total_tokens"],
+                 "prompt_tokens": stats["prompt_tokens"],
+                 "completion_tokens": stats["completion_tokens"],
+                 "cached_tokens": stats["cached_tokens"],
+             }
+         )
+     result_counts_stats_val = []
+     logger.info(f"\r\n Result counts stats: {result_counts_stats}")
+     for criteria_name, stats_val in result_counts_stats.items():
+         if isinstance(stats_val, dict):
+             logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}")
+             result_counts_stats_val.append(
+                 {
+                     "testing_criteria": criteria_name,
+                     "passed": stats_val.get("passed", 0),
+                     "failed": stats_val.get("failed", 0),
+                 }
+             )
+     return {
+         "result_counts": result_counts,
+         "per_model_usage": per_model_usage,
+         "per_testing_criteria_results": result_counts_stats_val,
+     }
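To illustrate the returned shape, a minimal sketch (not part of the diff) that feeds the function one hypothetical converted row. It assumes the function above and its module-level dependencies (e.g. _EvaluatorMetricMapping) are in scope; the criteria name, model name, and token counts are made up.

import logging

# Hypothetical single row with one passing result and its token usage.
row = {
    "status": "completed",
    "results": [
        {
            "name": "relevance",
            "metric": "relevance",
            "passed": True,
            "sample": {
                "model": "my-model",  # hypothetical model name
                "usage": {"total_tokens": 120, "prompt_tokens": 100, "completion_tokens": 20, "cached_tokens": 0},
            },
        }
    ],
}
summary = _calculate_aoai_evaluation_summary([row], logging.getLogger(__name__))
# Expected shape, following the logic above for a single passing row:
# summary["result_counts"]                 -> {"total": 1, "errored": 0, "failed": 0, "passed": 1}
# summary["per_testing_criteria_results"]  -> [{"testing_criteria": "relevance", "passed": 1, "failed": 0}]
# summary["per_model_usage"][0]["total_tokens"] -> 120
print(summary["result_counts"])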