azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
  3. azure/ai/evaluation/_aoai/label_grader.py +6 -10
  4. azure/ai/evaluation/_aoai/python_grader.py +7 -10
  5. azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
  6. azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +241 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -2
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
  33. azure/ai/evaluation/_evaluate/_utils.py +10 -3
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
  38. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  39. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
  40. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  41. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  42. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  43. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
  44. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  45. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  46. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  47. azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
  48. azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
  49. azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
  50. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
  52. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  53. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  55. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  56. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  57. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  58. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  59. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  60. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  61. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  62. azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
  63. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  64. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  65. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  66. azure/ai/evaluation/_exceptions.py +6 -1
  67. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  68. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  69. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  70. azure/ai/evaluation/_model_configurations.py +26 -0
  71. azure/ai/evaluation/_version.py +1 -1
  72. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  73. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  74. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  75. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  76. azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
  77. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  78. azure/ai/evaluation/red_team/_red_team.py +494 -37
  79. azure/ai/evaluation/red_team/_red_team_result.py +48 -28
  80. azure/ai/evaluation/red_team/_result_processor.py +558 -29
  81. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  82. azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
  83. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  84. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  85. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  86. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  87. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  88. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  90. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  91. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  92. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  94. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  95. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
  96. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
  97. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  98. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  99. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
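The remainder of this diff covers azure/ai/evaluation/_evaluate/_evaluate.py (file 31 above), which gains AOAI-style result conversion and Application Insights event emission. A minimal sketch of how the new private hooks visible in this diff might be exercised; the kwargs are internal (underscore-prefixed) and the AppInsightsConfig keys beyond connection_string are assumptions taken from the attribute names read in the code below:

    from azure.ai.evaluation import evaluate, F1ScoreEvaluator

    # Hypothetical call; _convert_to_aoai_evaluation_result and
    # _app_insights_configuration are private kwargs added in 1.13.0 and may change.
    result = evaluate(
        data="eval_data.jsonl",  # assumed local JSONL dataset
        evaluators={"f1_score": F1ScoreEvaluator()},
        _convert_to_aoai_evaluation_result=True,
        _app_insights_configuration={
            "connection_string": "<APPLICATION_INSIGHTS_CONNECTION_STRING>",
            "run_type": "manual",          # optional; mapped to gen_ai.evaluation.azure_ai_type
            "project_id": "<project-id>",  # optional; mapped to gen_ai.azure_ai_project.id
        },
    )
    # With conversion enabled, the result dict additionally carries
    # "_evaluation_results_list" and "_evaluation_summary".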
@@ -5,10 +5,12 @@ import inspect
  import contextlib
  import json
  import logging
+ import math
  import os
  import re
  import tempfile
  import json
+ import time
  from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast

  from openai import OpenAI, AzureOpenAI
@@ -20,7 +22,6 @@ from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform
  from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
  from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
  from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader

  from .._constants import (
@@ -32,8 +33,10 @@ from .._constants import (
  _InternalEvaluationMetrics,
  BINARY_AGGREGATE_SUFFIX,
  DEFAULT_OAI_EVAL_RUN_NAME,
+ EVALUATION_EVENT_NAME,
+ _EvaluatorMetricMapping,
  )
- from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
+ from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig
  from .._user_agent import UserAgentSingleton
  from ._batch_run import (
  EvalRunContext,
@@ -283,6 +286,51 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
  return results


+ def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
+ """Identify token count columns from known SDK metrics that should be excluded from aggregation.
+
+ Token counts from custom evaluators are not excluded, only those from EvaluationMetrics
+ and _InternalEvaluationMetrics.
+
+ :param df: The dataframe of evaluation results.
+ :type df: ~pandas.DataFrame
+ :return: List of column names to exclude from aggregation.
+ :rtype: List[str]
+ """
+ # Get all metric values from EvaluationMetrics class
+ evaluation_metrics_values = [
+ getattr(EvaluationMetrics, attr)
+ for attr in dir(EvaluationMetrics)
+ if not attr.startswith("_") and isinstance(getattr(EvaluationMetrics, attr), str)
+ ]
+
+ # Get all metric values from _InternalEvaluationMetrics class
+ internal_metrics_values = [
+ getattr(_InternalEvaluationMetrics, attr)
+ for attr in dir(_InternalEvaluationMetrics)
+ if not attr.startswith("_") and isinstance(getattr(_InternalEvaluationMetrics, attr), str)
+ ]
+
+ # Combine all known metrics
+ all_known_metrics = evaluation_metrics_values + internal_metrics_values
+
+ # Find token count columns that belong to known metrics
+ token_count_cols = [
+ col
+ for col in df.columns
+ if (
+ any(
+ col.endswith(f"{metric}_prompt_tokens")
+ or col.endswith(f"{metric}_completion_tokens")
+ or col.endswith(f"{metric}_total_tokens")
+ for metric in all_known_metrics
+ )
+ )
+ ]
+
+ return token_count_cols
+
+
  def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
  """Aggregate metrics from the evaluation results.
  On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -315,6 +363,10 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
  handled_columns.extend(label_cols)
  defect_rates.update(label_defect_rates)

+ # Exclude token count columns from aggregation for known SDK metrics
+ token_count_cols = _get_token_count_columns_to_exclude(df)
+ handled_columns.extend(token_count_cols)
+
  # For rest of metrics, we will calculate mean
  df.drop(columns=handled_columns, inplace=True)

@@ -793,7 +845,7 @@ def evaluate(
  try:
  user_agent: Optional[str] = kwargs.get("user_agent")
  with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
- return _evaluate(
+ results = _evaluate(
  evaluation_name=evaluation_name,
  target=target,
  data=data,
@@ -805,6 +857,7 @@
  tags=tags,
  **kwargs,
  )
+ return results
  except Exception as e:
  # Handle multiprocess bootstrap error
  bootstrap_error = (
@@ -900,6 +953,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  results_df = pd.DataFrame()
  metrics: Dict[str, float] = {}
  eval_run_info_list: List[OAIEvalRunCreationInfo] = []
+ eval_run_summary_dict = {}

  # Start OAI eval runs if any graders are present.
  need_oai_run = len(graders) > 0
@@ -934,6 +988,8 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
  got_local_results = True
  # TODO figure out how to update this printing to include OAI results?
  _print_summary(per_evaluator_results)
+ eval_run_summary_dict = {name: result["run_summary"] for name, result in per_evaluator_results.items()}
+ LOGGER.info(f"run_summary: \r\n{json.dumps(eval_run_summary_dict, indent=4)}")
  except EvaluationException as e:
  if need_get_oai_results:
  # If there are OAI graders, we only print a warning on local failures.
@@ -981,13 +1037,322 @@

  result_df_dict = results_df.to_dict("records")
  result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
+ # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
+
+ eval_id: Optional[str] = kwargs.get("_eval_id")
+ eval_run_id: Optional[str] = kwargs.get("_eval_run_id")
+ eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("_eval_meta_data")
+ if kwargs.get("_convert_to_aoai_evaluation_result", False):
+ _convert_results_to_aoai_evaluation_results(
+ result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data
+ )
+ if app_insights_configuration := kwargs.get("_app_insights_configuration"):
+ emit_eval_result_events_to_app_insights(
+ app_insights_configuration, result["_evaluation_results_list"], evaluator_config
+ )

  if output_path:
  _write_output(output_path, result)
-
  return result


+ def _build_internal_log_attributes(
+ event_data: Dict[str, Any],
+ metric_name: str,
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]],
+ internal_log_attributes: Dict[str, str],
+ ) -> Dict[str, str]:
+ """
+ Build internal log attributes for OpenTelemetry logging.
+
+ :param event_data: The event data containing threshold and name information
+ :type event_data: Dict[str, Any]
+ :param metric_name: The name of the metric being evaluated
+ :type metric_name: str
+ :param evaluator_config: Configuration for evaluators
+ :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+ :return: Dictionary of internal log attributes
+ :rtype: Dict[str, str]
+ """
+ # Add threshold if present
+ if event_data.get("threshold"):
+ internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
+
+ # Add testing criteria details if present
+ testing_criteria_name = event_data.get("name")
+ if testing_criteria_name:
+ internal_log_attributes["gen_ai.evaluation.testing_criteria.name"] = testing_criteria_name
+
+ # Get evaluator definition details
+ if evaluator_config and testing_criteria_name in evaluator_config:
+ testing_criteria_config = evaluator_config[testing_criteria_name]
+
+ if evaluator_name := testing_criteria_config.get("_evaluator_name"):
+ internal_log_attributes["gen_ai.evaluator.name"] = str(evaluator_name)
+
+ if evaluator_version := testing_criteria_config.get("_evaluator_version"):
+ internal_log_attributes["gen_ai.evaluator.version"] = str(evaluator_version)
+
+ if evaluator_id := testing_criteria_config.get("_evaluator_id"):
+ internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
+
+ if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
+ metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
+
+ if metric_config_detail:
+ if metric_config_detail.get("min_value") is not None:
+ internal_log_attributes["gen_ai.evaluation.min_value"] = str(metric_config_detail["min_value"])
+ if metric_config_detail.get("max_value") is not None:
+ internal_log_attributes["gen_ai.evaluation.max_value"] = str(metric_config_detail["max_value"])
+
+ return internal_log_attributes
+
+
+ def _log_events_to_app_insights(
+ otel_logger,
+ events: List[Dict[str, Any]],
+ log_attributes: Dict[str, Any],
+ app_insights_config: AppInsightsConfig,
+ data_source_item: Optional[Dict[str, Any]] = None,
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+ ) -> None:
+ """
+ Log independent events directly to App Insights using OpenTelemetry logging.
+ No spans are created - events are sent as pure log records.
+
+ :param otel_logger: OpenTelemetry logger instance
+ :type otel_logger: Logger
+ :param events: List of event data dictionaries to log
+ :type events: List[Dict[str, Any]]
+ :param log_attributes: Attributes dict to use for each event (already includes extra_attributes if present)
+ :type log_attributes: Dict[str, Any]
+ :param app_insights_config: App Insights configuration containing connection string
+ :type app_insights_config: AppInsightsConfig
+ :param data_source_item: Data source item containing trace, response, and agent information
+ :type data_source_item: Optional[Dict[str, Any]]
+ """
+
+ from opentelemetry import trace
+ from opentelemetry.trace import SpanContext, TraceFlags, NonRecordingSpan
+
+ try:
+ # Initialize values from AppInsights config as defaults
+ trace_id = None
+ span_id = None
+ response_id = None
+ conversation_id = None
+ previous_response_id = None
+ agent_id = app_insights_config.get("agent_id", None)
+ agent_version = app_insights_config.get("agent_version", None)
+ agent_name = app_insights_config.get("agent_name", None)
+
+ # Data source item values have higher priority and will override AppInsights config defaults
+ if data_source_item:
+ for key, value in data_source_item.items():
+ if key.endswith("trace_id") and value and isinstance(value, str):
+ # Remove dashes if present
+ trace_id_str = str(value).replace("-", "").lower()
+ if len(trace_id_str) == 32: # Valid trace_id length
+ trace_id = int(trace_id_str, 16)
+ elif key == "previous_response_id" and value and isinstance(value, str):
+ previous_response_id = value
+ elif key == "response_id" and value and isinstance(value, str):
+ response_id = value
+ elif key == "conversation_id" and value and isinstance(value, str):
+ conversation_id = value
+ elif key == "agent_id" and value and isinstance(value, str):
+ agent_id = value
+ elif key.endswith("span_id") and value and isinstance(value, str):
+ # Remove dashes if present and convert to int
+ span_id_str = str(value).replace("-", "").lower()
+ if len(span_id_str) == 16: # Valid span_id length (64-bit = 16 hex chars)
+ span_id = int(span_id_str, 16)
+ elif key == "agent_version" and value and isinstance(value, str):
+ agent_version = value
+ elif key == "agent_name" and value and isinstance(value, str):
+ agent_name = value
+
+ # Log each event as a separate log record
+ for i, event_data in enumerate(events):
+ try:
+ # Prepare log record attributes with specific mappings
+ # The standard attributes are already in https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-events.md#event-eventgen_aievaluationresult
+ metric_name = event_data.get("metric")
+ standard_log_attributes = {}
+ standard_log_attributes["microsoft.custom_event.name"] = EVALUATION_EVENT_NAME
+ standard_log_attributes["gen_ai.evaluation.name"] = metric_name
+ if event_data.get("score") is not None:
+ standard_log_attributes["gen_ai.evaluation.score.value"] = event_data.get("score")
+ if event_data.get("label") is not None:
+ standard_log_attributes["gen_ai.evaluation.score.label"] = event_data.get("label")
+
+ # Internal proposed attributes
+ # Put it in internal property bag for now, will be expanded if we got sign-off to Otel standard later.
+ internal_log_attributes = _build_internal_log_attributes(
+ event_data, metric_name, evaluator_config, log_attributes
+ )
+
+ # Optional field that may not always be present
+ if "reason" in event_data:
+ standard_log_attributes["gen_ai.evaluation.explanation"] = str(event_data["reason"])
+
+ # Handle error from sample if present
+ # Put the error message in error.type to follow OTel semantic conventions
+ error = event_data.get("sample", {}).get("error", {}).get("message", None)
+ if error:
+ standard_log_attributes["error.type"] = error
+
+ # Handle redteam attack properties if present
+ if "properties" in event_data:
+ properties = event_data["properties"]
+
+ if "attack_success" in properties:
+ internal_log_attributes["gen_ai.redteam.attack.success"] = str(properties["attack_success"])
+
+ if "attack_technique" in properties:
+ internal_log_attributes["gen_ai.redteam.attack.technique"] = str(properties["attack_technique"])
+
+ if "attack_complexity" in properties:
+ internal_log_attributes["gen_ai.redteam.attack.complexity"] = str(
+ properties["attack_complexity"]
+ )
+
+ if "attack_success_threshold" in properties:
+ internal_log_attributes["gen_ai.redteam.attack.success_threshold"] = str(
+ properties["attack_success_threshold"]
+ )
+
+ # Add data source item attributes if present
+ if response_id:
+ standard_log_attributes["gen_ai.response.id"] = response_id
+ if conversation_id:
+ standard_log_attributes["gen_ai.conversation.id"] = conversation_id
+ if previous_response_id:
+ internal_log_attributes["gen_ai.previous.response.id"] = previous_response_id
+ if agent_id:
+ standard_log_attributes["gen_ai.agent.id"] = agent_id
+ if agent_name:
+ standard_log_attributes["gen_ai.agent.name"] = agent_name
+ if agent_version:
+ internal_log_attributes["gen_ai.agent.version"] = agent_version
+
+ # Combine standard and internal attributes, put internal under the properties bag
+ standard_log_attributes["internal_properties"] = json.dumps(internal_log_attributes)
+ # Anonymize IP address to prevent Azure GeoIP enrichment and location tracking
+ standard_log_attributes["http.client_ip"] = "0.0.0.0"
+
+ # Create context with trace_id and span_id if present (for distributed tracing correlation)
+ ctx = None
+ if trace_id:
+ span_context = SpanContext(
+ trace_id=trace_id,
+ span_id=span_id if span_id else 0, # Use extracted span_id or 0 if not available
+ is_remote=False,
+ trace_flags=TraceFlags(0x01),
+ )
+ span = NonRecordingSpan(span_context)
+ ctx = trace.set_span_in_context(span)
+
+ otel_logger.emit(
+ timestamp=time.time_ns(),
+ observed_timestamp=time.time_ns(),
+ body=EVALUATION_EVENT_NAME,
+ attributes=standard_log_attributes,
+ context=ctx,
+ )
+
+ except Exception as e:
+ LOGGER.warning(f"Failed to log event {i}: {e}")
+
+ except Exception as e:
+ LOGGER.error(f"Failed to log events to App Insights: {e}")
+
+
+ def emit_eval_result_events_to_app_insights(
+ app_insights_config: AppInsightsConfig,
+ results: List[Dict],
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
+ ) -> None:
+ """
+ Emit evaluation result events to App Insights using OpenTelemetry logging.
+ Each result is logged as an independent log record, potentially including trace context.
+
+ :param app_insights_config: App Insights configuration containing connection string
+ :type app_insights_config: AppInsightsConfig
+ :param results: List of evaluation results to log
+ :type results: List[Dict]
+ """
+
+ from opentelemetry import _logs
+ from opentelemetry.sdk._logs import LoggerProvider
+ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.semconv.resource import ResourceAttributes
+ from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter
+
+ if not results:
+ LOGGER.debug("No results to log to App Insights")
+ return
+
+ try:
+ # Configure OpenTelemetry logging with anonymized Resource attributes
+
+ # Create a resource with minimal attributes to prevent sensitive data collection
+ # SERVICE_INSTANCE_ID maps to cloud_RoleInstance in Azure Monitor and prevents
+ # Azure Monitor from auto-detecting the device hostname
+ anonymized_resource = Resource.create(
+ {
+ ResourceAttributes.SERVICE_NAME: "unknown",
+ ResourceAttributes.SERVICE_INSTANCE_ID: "unknown",
+ }
+ )
+
+ logger_provider = LoggerProvider(resource=anonymized_resource)
+ _logs.set_logger_provider(logger_provider)
+
+ # Create Azure Monitor log exporter
+ azure_log_exporter = AzureMonitorLogExporter(connection_string=app_insights_config["connection_string"])
+
+ # Add the Azure Monitor exporter to the logger provider
+ logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
+
+ # Create a logger from OUR configured logger_provider (not the global one)
+ # This ensures the logger uses our anonymized resource
+ otel_logger = logger_provider.get_logger(__name__)
+
+ # Initialize base log attributes with extra_attributes if present, otherwise empty dict
+ base_log_attributes = app_insights_config.get("extra_attributes", {})
+
+ # Add AppInsights config attributes with proper semantic convention mappings
+ if "run_type" in app_insights_config:
+ base_log_attributes["gen_ai.evaluation.azure_ai_type"] = str(app_insights_config["run_type"])
+ if "schedule_type" in app_insights_config:
+ base_log_attributes["gen_ai.evaluation.azure_ai_scheduled"] = str(app_insights_config["schedule_type"])
+ if "run_id" in app_insights_config:
+ base_log_attributes["gen_ai.evaluation.run.id"] = str(app_insights_config["run_id"])
+ if "project_id" in app_insights_config:
+ base_log_attributes["gen_ai.azure_ai_project.id"] = str(app_insights_config["project_id"])
+
+ for result in results:
+ # Create a copy of base attributes for this result's events
+ log_attributes = base_log_attributes.copy()
+
+ _log_events_to_app_insights(
+ otel_logger=otel_logger,
+ events=result["results"],
+ log_attributes=log_attributes,
+ data_source_item=result["datasource_item"] if "datasource_item" in result else None,
+ evaluator_config=evaluator_config,
+ app_insights_config=app_insights_config,
+ )
+ # Force flush to ensure events are sent
+ logger_provider.force_flush()
+ LOGGER.info(f"Successfully logged {len(results)} evaluation results to App Insights")
+
+ except Exception as e:
+ LOGGER.error(f"Failed to emit evaluation results to App Insights: {e}")
+
+
  def _preprocess_data(
  data: Union[str, os.PathLike],
  evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
@@ -1070,7 +1435,7 @@ def _preprocess_data(
  batch_run_data = input_data_df
  elif client_type == "pf_client":
  batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
- # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
+ # Ensure the absolute path is Re to pf.run, as relative path doesn't work with
  # multiple evaluators. If the path is already absolute, abspath will return the original path.
  batch_run_data = os.path.abspath(data)
  elif client_type == "code_client":
@@ -1406,3 +1771,652 @@ def _turn_error_logs_into_exception(log_path: str) -> None:
  category=ErrorCategory.FAILED_EXECUTION,
  blame=ErrorBlame.UNKNOWN,
  )
+
+
+ def _convert_results_to_aoai_evaluation_results(
+ results: EvaluationResult,
+ logger: logging.Logger,
+ eval_id: Optional[str] = None,
+ eval_run_id: Optional[str] = None,
+ evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] = None,
+ eval_run_summary: Optional[Dict[str, Any]] = None,
+ eval_meta_data: Optional[Dict[str, Any]] = None,
+ ) -> None:
+ """
+ Convert evaluation results to AOAI evaluation results format.
+
+ Each row of input results.rows looks like:
+ {"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe",
+ "inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.",
+ "outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5}
+
+ Convert each row into new RunOutputItem object with results array.
+
+ :param results: The evaluation results to convert
+ :type results: EvaluationResult
+ :param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria
+ :type eval_meta_data: Dict[str, Any]
+ :param logger: Logger instance
+ :type logger: logging.Logger
+ :return: EvaluationResult with converted evaluation results in AOAI format
+ :rtype: EvaluationResult
+ """
+
+ if evaluators is None:
+ return
+
+ # Get the testing_criteria_name and testing_criteria_type from evaluators
+ testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {}
+ criteria_name_types_from_meta: Optional[Dict[str, str]] = {}
+ if eval_meta_data and "testing_criteria" in eval_meta_data:
+ testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")
+ if testing_criteria_list is not None:
+ for criteria in testing_criteria_list:
+ criteria_name = criteria.get("name")
+ criteria_type = criteria.get("type")
+ if criteria_name is not None and criteria_type is not None:
+ criteria_name_types_from_meta[criteria_name] = criteria
+
+ for criteria_name, evaluator in evaluators.items():
+ criteria_type = None
+ metrics = []
+ if criteria_name in criteria_name_types_from_meta:
+ criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
+ evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
+ current_evaluator_metrics = criteria_name_types_from_meta[criteria_name].get("metrics", None)
+ if current_evaluator_metrics and len(current_evaluator_metrics) > 0:
+ metrics.extend(current_evaluator_metrics)
+ elif evaluator_name:
+ if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+ evaluator_name = evaluator_name.replace("builtin.", "")
+ metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
+ if metrics_mapped and len(metrics_mapped) > 0:
+ metrics.extend(metrics_mapped)
+ else:
+ metrics.append(criteria_name)
+ elif isinstance(evaluator, AzureOpenAIGrader):
+ criteria_type = evaluator._type # pylint: disable=protected-access
+ metrics.append(criteria_name)
+ elif isinstance(evaluator, EvaluatorBase):
+ criteria_type = "azure_ai_evaluator"
+ evaluator_class_name = evaluator.__class__.__name__
+ eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None)
+ if eval_name:
+ metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
+ if metrics_mapped and len(metrics_mapped) > 0:
+ metrics.extend(metrics_mapped)
+ else:
+ metrics.append(criteria_name)
+ else:
+ criteria_type = "unknown"
+ metrics.append(criteria_name)
+ testing_criteria_name_types_metrics[criteria_name] = {"type": criteria_type, "metrics": metrics}
+
+ created_time = int(time.time())
+ converted_rows = []
+
+ for row_idx, row in enumerate(results.get("rows", [])):
+ # Group outputs by test criteria name
+ criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()}
+ input_groups = {}
+ top_sample = {}
+ for key, value in row.items():
+ if key.startswith("outputs."):
+ # Parse key: outputs.<test-criteria-name>.<metric>
+ parts = key.split(".", 2) # Split into max 3 parts: ['outputs', '<criteria-name>', '<metric>']
+ if len(parts) >= 3:
+ criteria_name = parts[1]
+ metric_name = parts[2]
+
+ if criteria_name not in criteria_groups:
+ criteria_groups[criteria_name] = {}
+
+ criteria_groups[criteria_name][metric_name] = value
+ elif key.startswith("inputs."):
+ input_key = key.replace("inputs.", "")
+ if input_key not in input_groups:
+ input_groups[input_key] = value
+
+ # Convert each criteria group to RunOutputItem result
+ run_output_results = []
+ for criteria_name, metrics in criteria_groups.items():
+ # Extract metrics for this criteria
+ expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
+ criteria_type = testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "unknown")
+ result_per_metric = {}
+ # Find score - look for various score patterns
+ for metric_key, metric_value in metrics.items():
+ if metric_key.endswith("_score") or metric_key == "score":
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"score": metric_value}
+ else:
+ result_per_metric[metric]["score"] = metric_value
+ _append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
+ if metric_key == "passed":
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"passed": metric_value}
+ else:
+ result_per_metric[metric]["passed"] = metric_value
+ _append_indirect_attachments_to_results(result_per_metric, "passed", metric, metric_value)
+ elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ label = metric_value
+ passed = (
+ True if (str(metric_value).lower() == "pass" or str(metric_value).lower() == "true") else False
+ )
+ if metric not in result_per_metric:
+ if criteria_type == "azure_ai_evaluator":
+ result_per_metric[metric] = {"label": label, "passed": passed}
+ else:
+ result_per_metric[metric] = {"label": label}
+ else:
+ result_per_metric[metric]["label"] = metric_value
+ if criteria_type == "azure_ai_evaluator":
+ result_per_metric[metric]["passed"] = passed
+ _append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
+ if criteria_type == "azure_ai_evaluator":
+ _append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
+ elif (
+ metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
+ ) or metric_key == "reason":
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"reason": metric_value}
+ else:
+ result_per_metric[metric]["reason"] = metric_value
+ _append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
+ elif metric_key.endswith("_threshold") or metric_key == "threshold":
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"threshold": metric_value}
+ else:
+ result_per_metric[metric]["threshold"] = metric_value
+ _append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
+ elif metric_key == "sample":
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"sample": metric_value}
+ else:
+ result_per_metric[metric]["sample"] = metric_value
+ _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
+ elif metric_key.endswith("_finish_reason"):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"sample": {"finish_reason": metric_value}}
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+ result_per_metric[metric]["sample"] = {"finish_reason": metric_value}
+ elif (
+ metric in result_per_metric
+ and "sample" in result_per_metric[metric]
+ and "finish_reason" not in result_per_metric[metric]["sample"]
+ ):
+ result_per_metric[metric]["sample"]["finish_reason"] = metric_value
+ _append_indirect_attachments_to_results(
+ result_per_metric, "sample", metric, metric_value, "finish_reason"
+ )
+ elif metric_key.endswith("_model"):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"sample": {"model": metric_value}}
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+ result_per_metric[metric]["sample"] = {"model": metric_value}
+ elif (
+ metric in result_per_metric
+ and "sample" in result_per_metric[metric]
+ and "model" not in result_per_metric[metric]["sample"]
+ ):
+ result_per_metric[metric]["sample"]["model"] = metric_value
+ _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
+ elif metric_key.endswith("_sample_input"):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ input_metric_val_json: Optional[List[Dict[str, Any]]] = []
+ try:
+ input_metric_val_json = json.loads(metric_value)
+ except Exception as e:
+ logger.warning(f"Failed to parse _sample_input value as JSON: {e}")
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"sample": {"input": input_metric_val_json}}
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+ result_per_metric[metric]["sample"] = {"input": input_metric_val_json}
+ elif (
+ metric in result_per_metric
+ and "sample" in result_per_metric[metric]
+ and "input" not in result_per_metric[metric]["sample"]
+ ):
+ result_per_metric[metric]["sample"]["input"] = input_metric_val_json
+ _append_indirect_attachments_to_results(
+ result_per_metric, "sample", metric, input_metric_val_json, "input"
+ )
+ elif metric_key.endswith("_sample_output"):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ output_metric_val_json: Optional[List[Dict[str, Any]]] = []
+ try:
+ output_metric_val_json = json.loads(metric_value)
+ except Exception as e:
+ logger.warning(f"Failed to parse _sample_output value as JSON: {e}")
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"sample": {"output": output_metric_val_json}}
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+ result_per_metric[metric]["sample"] = {"output": output_metric_val_json}
+ elif (
+ metric in result_per_metric
+ and "sample" in result_per_metric[metric]
+ and "output" not in result_per_metric[metric]["sample"]
+ ):
+ result_per_metric[metric]["sample"]["output"] = output_metric_val_json
+ _append_indirect_attachments_to_results(
+ result_per_metric, "sample", metric, output_metric_val_json, "output"
+ )
+ elif metric_key.endswith("_total_tokens"):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+ result_per_metric[metric]["sample"] = {"usage": {"total_tokens": metric_value}}
+ elif (
+ metric in result_per_metric
+ and "sample" in result_per_metric[metric]
+ and "usage" not in result_per_metric[metric]["sample"]
+ ):
+ result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
+ else:
+ result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
+ _append_indirect_attachments_to_results(
+ result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
+ )
+ elif metric_key.endswith("_prompt_tokens"):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+ result_per_metric[metric]["sample"] = {"usage": {"prompt_tokens": metric_value}}
+ elif (
+ metric in result_per_metric
+ and "sample" in result_per_metric[metric]
+ and "usage" not in result_per_metric[metric]["sample"]
+ ):
+ result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
+ else:
+ result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
+ _append_indirect_attachments_to_results(
+ result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
+ )
+ elif metric_key.endswith("_completion_tokens"):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ if metric not in result_per_metric:
+ result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
+ result_per_metric[metric]["sample"] = {"usage": {"completion_tokens": metric_value}}
+ elif (
+ metric in result_per_metric
+ and "sample" in result_per_metric[metric]
+ and "usage" not in result_per_metric[metric]["sample"]
+ ):
+ result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
+ else:
+ result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
+ _append_indirect_attachments_to_results(
+ result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
+ )
+ elif not any(
+ metric_key.endswith(suffix)
+ for suffix in [
+ "_result",
+ "_reason",
+ "_threshold",
+ "_label",
+ "_score",
+ "_model",
+ "_finish_reason",
+ "_sample_input",
+ "_sample_output",
+ "_total_tokens",
+ "_prompt_tokens",
+ "_completion_tokens",
+ ]
+ ):
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+ # If no score found yet and this doesn't match other patterns, use as score
+ if metric_key == metric and metric not in result_per_metric:
+ result_per_metric[metric] = {"score": metric_value}
+ elif metric_key == metric and result_per_metric[metric].get("score", None) is None:
+ result_per_metric[metric]["score"] = metric_value
+
+ for metric, metric_values in result_per_metric.items():
+ score = metric_values.get("score", None)
+ label = metric_values.get("label", None)
+ reason = metric_values.get("reason", None)
+ threshold = metric_values.get("threshold", None)
+ passed = metric_values.get("passed", None)
+ sample = metric_values.get("sample", None)
+
+ # Create result object for this criteria
+ result_obj = {
+ "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+ "type", "azure_ai_evaluator"
+ ),
+ "name": criteria_name, # Use criteria name as name
+ "metric": metric if metric is not None else criteria_name, # Use criteria name as metric
+ }
+ # Add optional fields
+ if (
+ metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
+ or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
+ or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
+ ):
+ copy_label = label
+ if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
+ label = "fail"
+ score = 0.0
+ passed = False
+ else:
+ label = "pass"
+ score = 1.0
+ passed = True
+ result_obj["score"] = (
+ score if not (score is None or (isinstance(score, float) and math.isnan(score))) else None
+ )
+ result_obj["label"] = label
+ result_obj["reason"] = reason
+ result_obj["threshold"] = threshold
+ result_obj["passed"] = passed
+
+ if sample is not None:
+ result_obj["sample"] = sample
+ top_sample = sample # Save top sample for the row
+ run_output_results.append(result_obj)
+
+ if (
+ eval_run_summary
+ and criteria_name in eval_run_summary
+ and isinstance(eval_run_summary[criteria_name], dict)
+ and "error_code" in eval_run_summary[criteria_name]
+ ) and eval_run_summary[criteria_name].get("error_code", None) is not None:
+ error_info = (
+ {
+ "code": eval_run_summary[criteria_name].get("error_code", None),
+ "message": eval_run_summary[criteria_name].get("error_message", None),
+ }
+ if eval_run_summary[criteria_name].get("error_code", None) is not None
+ else None
+ )
+ sample = {"error": error_info} if error_info is not None else None
+ # Create result object for this criteria
+ metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
+ for metric in metrics:
+ result_obj = {
+ "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
+ "type", "azure_ai_evaluator"
+ ),
+ "name": criteria_name, # Use criteria name as name
+ "metric": metric if metric is not None else criteria_name, # Use criteria name as metric
+ "score": None,
+ "label": None,
+ "reason": None,
+ "threshold": None,
+ "passed": None,
+ "sample": sample,
+ }
+ run_output_results.append(result_obj)
+
+ # Create RunOutputItem structure
+ run_output_item = {
+ "object": "eval.run.output_item",
+ "id": f"{row_idx+1}",
+ "run_id": eval_run_id,
+ "eval_id": eval_id,
+ "created_at": created_time,
+ "datasource_item_id": row_idx,
+ "datasource_item": input_groups,
+ "results": run_output_results,
+ "status": "completed" if len(run_output_results) > 0 else "error",
+ }
+
+ run_output_item["sample"] = top_sample
+
+ converted_rows.append(run_output_item)
+
+ # Create converted results maintaining the same structure
+ results["_evaluation_results_list"] = converted_rows
+ logger.info(
+ f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
+ )
+ # Calculate summary statistics
+ evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger)
+ results["_evaluation_summary"] = evaluation_summary
+ logger.info(
+ f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
+ )
+
+
+ def _append_indirect_attachments_to_results(
+ current_result_dict: Dict[str, Any],
+ result_name: str,
+ metric: str,
+ metric_value: Any,
+ nested_result_name: Optional[str] = None,
+ secondnested_result_name: Optional[str] = None,
+ ) -> None:
+ """
+ Append indirect attachments to the current result dictionary.
+
+ :param current_result_dict: The current result dictionary to update
+ :type current_result_dict: Dict[str, Any]
+ :param result_name: The result name
+ :type result_name: str
+ :param metric: The metric name
+ :type metric: str
+ :param metric_value: The value of the metric
+ :type metric_value: Any
+ """
+ if metric == "xpia" and result_name:
+ for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
+ if nested_result_name is None:
+ if metric_extended not in current_result_dict:
+ current_result_dict[metric_extended] = {result_name: metric_value}
+ else:
+ current_result_dict[metric_extended][result_name] = metric_value
+ elif nested_result_name is not None and secondnested_result_name is None:
+ if metric_extended not in current_result_dict:
+ current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
+ elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+ current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
+ elif (
+ metric_extended in current_result_dict
+ and result_name in current_result_dict[metric_extended]
+ and nested_result_name not in current_result_dict[metric_extended][result_name]
+ ):
+ current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
+ elif nested_result_name is not None and secondnested_result_name is not None:
+ if metric_extended not in current_result_dict:
+ current_result_dict[metric_extended] = {
+ result_name: {nested_result_name: {secondnested_result_name: metric_value}}
+ }
+ elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+ current_result_dict[metric_extended][result_name] = {
+ nested_result_name: {secondnested_result_name: metric_value}
+ }
+ elif (
+ metric_extended in current_result_dict
+ and result_name in current_result_dict[metric_extended]
+ and nested_result_name not in current_result_dict[metric_extended][result_name]
+ ):
+ current_result_dict[metric_extended][result_name][nested_result_name] = {
+ secondnested_result_name: metric_value
+ }
+ else:
+ (
+ current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
+ ) = metric_value
+
+
+ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+ """
+ Get the metric name from the testing criteria and metric key.
+
+ :param testing_criteria_name: The name of the testing criteria
+ :type testing_criteria_name: str
+ :param metric_key: The metric key to look for
+ :type metric_key: str
+ :param metric_list: List of expected metrics for the testing criteria
+ :type metric_list: List[str]
+ :return: The metric name if found, otherwise the testing criteria name
+ :rtype: str
+ """
+ metric = None
+
+ if metric_key == "xpia_manipulated_content":
+ metric = "xpia_manipulated_content"
+ return metric
+ elif metric_key == "xpia_intrusion":
+ metric = "xpia_intrusion"
+ return metric
+ elif metric_key == "xpia_information_gathering":
+ metric = "xpia_information_gathering"
+ return metric
+ for expected_metric in metric_list:
+ if metric_key.startswith(expected_metric):
+ metric = expected_metric
+ break
+ if metric is None:
+ metric = testing_criteria_name
+ return metric
+
+
+ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger) -> Dict[str, Any]:
+ """
+ Calculate summary statistics for AOAI evaluation results.
+
+ :param aoai_results: List of AOAI result objects (run_output_items)
+ :type aoai_results: list
+ :return: Summary statistics dictionary
+ :rtype: Dict[str, Any]
+ """
+ # Calculate result counts based on aoaiResults
+ result_counts = {"total": 0, "errored": 0, "failed": 0, "passed": 0}
+
+ # Count results by status and calculate per model usage
+ model_usage_stats = {} # Dictionary to aggregate usage by model
+ result_counts_stats = {} # Dictionary to aggregate usage by model
+
+ for aoai_result in aoai_results:
+ logger.info(
+ f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}"
+ )
+ result_counts["total"] += 1
+ passed_count = 0
+ failed_count = 0
+ error_count = 0
+ if isinstance(aoai_result, dict) and "results" in aoai_result:
+ logger.info(
+ f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}"
+ )
+ for result_item in aoai_result["results"]:
+ if isinstance(result_item, dict):
+ # Check if the result has a 'passed' field
+ if "passed" in result_item and result_item["passed"] is not None:
+ testing_criteria = result_item.get("name", "")
+ if testing_criteria not in result_counts_stats:
+ result_counts_stats[testing_criteria] = {
+ "testing_criteria": testing_criteria,
+ "failed": 0,
+ "passed": 0,
+ }
+ if result_item["passed"] is True:
+ passed_count += 1
+ result_counts_stats[testing_criteria]["passed"] += 1
+
+ elif result_item["passed"] is False:
+ failed_count += 1
+ result_counts_stats[testing_criteria]["failed"] += 1
+ # Check if the result indicates an error status
+ elif ("status" in result_item and result_item["status"] in ["error", "errored"]) or (
+ "sample" in result_item
+ and isinstance(result_item["sample"], dict)
+ and result_item["sample"].get("error", None) is not None
+ ):
+ error_count += 1
+ elif hasattr(aoai_result, "status") and aoai_result.status == "error":
+ error_count += 1
+ elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
+ error_count += 1
+
+ if error_count > 0:
+ result_counts["errored"] += 1
+ elif failed_count > 0:
+ result_counts["failed"] += 1
+ elif (
+ error_count == 0
+ and failed_count == 0
+ and passed_count > 0
+ and passed_count == len(aoai_result.get("results", []))
+ ):
+ result_counts["passed"] += 1
+
+ # Extract usage statistics from aoai_result.sample
+ sample_data_list = []
+ dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
+ dup_usage_list.remove("xpia")
+ if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
+ for result_item in aoai_result["results"]:
+ if (
+ isinstance(result_item, dict)
+ and "sample" in result_item
+ and result_item["sample"]
+ and result_item["metric"] not in dup_usage_list
+ ):
+ sample_data_list.append(result_item["sample"])
+
+ for sample_data in sample_data_list:
+ if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
+ usage_data = sample_data["usage"]
+ model_name = sample_data.get("model", "unknown")
+ if model_name not in model_usage_stats:
+ model_usage_stats[model_name] = {
+ "invocation_count": 0,
+ "total_tokens": 0,
+ "prompt_tokens": 0,
+ "completion_tokens": 0,
+ "cached_tokens": 0,
+ }
+ # Aggregate usage statistics
+ model_stats = model_usage_stats[model_name]
+ model_stats["invocation_count"] += 1
+ if isinstance(usage_data, dict):
+ model_stats["total_tokens"] += usage_data.get("total_tokens", 0)
+ model_stats["prompt_tokens"] += usage_data.get("prompt_tokens", 0)
+ model_stats["completion_tokens"] += usage_data.get("completion_tokens", 0)
+ model_stats["cached_tokens"] += usage_data.get("cached_tokens", 0)
+
+ # Convert model usage stats to list format matching EvaluationRunPerModelUsage
+ per_model_usage = []
+ for model_name, stats in model_usage_stats.items():
+ per_model_usage.append(
+ {
+ "model_name": model_name,
+ "invocation_count": stats["invocation_count"],
+ "total_tokens": stats["total_tokens"],
+ "prompt_tokens": stats["prompt_tokens"],
+ "completion_tokens": stats["completion_tokens"],
+ "cached_tokens": stats["cached_tokens"],
+ }
+ )
+ result_counts_stats_val = []
+ logger.info(f"\r\n Result counts stats: {result_counts_stats}")
+ for criteria_name, stats_val in result_counts_stats.items():
+ if isinstance(stats_val, dict):
+ logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}")
+ result_counts_stats_val.append(
+ {
+ "testing_criteria": criteria_name,
+ "passed": stats_val.get("passed", 0),
+ "failed": stats_val.get("failed", 0),
+ }
+ )
+ return {
+ "result_counts": result_counts,
+ "per_model_usage": per_model_usage,
+ "per_testing_criteria_results": result_counts_stats_val,
+ }
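For reference, a hypothetical value of the "_evaluation_summary" entry produced by _calculate_aoai_evaluation_summary above, for a single row whose only testing criterion ("groundedness", an illustrative name) passed; the field names follow the return statement and aggregation loops above, while the model name and token counts are made up:

    example_summary = {
        "result_counts": {"total": 1, "errored": 0, "failed": 0, "passed": 1},
        "per_model_usage": [
            {
                "model_name": "gpt-4o",  # taken from sample["model"] when present
                "invocation_count": 1,
                "total_tokens": 1200,
                "prompt_tokens": 1000,
                "completion_tokens": 200,
                "cached_tokens": 0,
            }
        ],
        "per_testing_criteria_results": [
            {"testing_criteria": "groundedness", "passed": 1, "failed": 0}
        ],
    }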