azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.5__py3-none-any.whl

This diff shows the content of the two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +85 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +147 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +87 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +430 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
  155. azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
  156. azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
  157. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  158. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  159. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  160. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  161. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  162. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  163. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  165. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/METADATA +378 -27
  264. azure_ai_evaluation-1.13.5.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate.py (+1809 -142):

@@ -2,46 +2,72 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  import inspect
+ import contextlib
  import json
  import logging
+ import math
  import os
  import re
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
+ import tempfile
+ import json
+ import time
+ from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast

+ from openai import OpenAI, AzureOpenAI
+ from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+ from azure.ai.evaluation._legacy._adapters.entities import Run
  import pandas as pd
- from promptflow._sdk._constants import LINE_NUMBER
- from promptflow._sdk._errors import UserAuthenticationError, UploadInternalError
- from promptflow.client import PFClient
- from promptflow.entities import Run

  from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
- from azure.ai.evaluation._common.utils import validate_azure_ai_project
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
+ from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader

  from .._constants import (
      CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+     EVALUATION_PASS_FAIL_MAPPING,
      EvaluationMetrics,
-     EvaluationRunProperties,
+     DefaultOpenEncoding,
      Prefixes,
      _InternalEvaluationMetrics,
+     BINARY_AGGREGATE_SUFFIX,
+     DEFAULT_OAI_EVAL_RUN_NAME,
+     EVALUATION_EVENT_NAME,
+     _EvaluatorMetricMapping,
+ )
+ from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig
+ from .._user_agent import UserAgentSingleton
+ from ._batch_run import (
+     EvalRunContext,
+     CodeClient,
+     ProxyClient,
+     TargetRunContext,
+     RunSubmitterClient,
  )
- from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
- from .._user_agent import USER_AGENT
- from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
  from ._utils import (
      _apply_column_mapping,
      _log_metrics_and_instance_results,
      _trace_destination_from_project_scope,
      _write_output,
+     DataLoaderFactory,
+     _log_metrics_and_instance_results_onedp,
+ )
+ from ._batch_run.batch_clients import BatchClient, BatchClientRun
+
+ from ._evaluate_aoai import (
+     _begin_aoai_evaluation,
+     _split_evaluators_and_grader_configs,
+     _get_evaluation_run_results,
+     OAIEvalRunCreationInfo,
  )

- TClient = TypeVar("TClient", ProxyClient, CodeClient)
  LOGGER = logging.getLogger(__name__)

  # For metrics (aggregates) whose metric names intentionally differ from their
  # originating column name, usually because the aggregation of the original value
  # means something sufficiently different.
- # Note that content safety metrics are handled seprately.
+ # Note that content safety metrics are handled separately.
  METRIC_COLUMN_NAME_REPLACEMENTS = {
      "groundedness_pro_label": "groundedness_pro_passing_rate",
  }
@@ -53,6 +79,21 @@ class __EvaluatorInfo(TypedDict):
      run_summary: Dict[str, Any]


+ class __ValidatedData(TypedDict):
+     """
+     Simple dictionary that contains ALL pre-processed data and
+     the resultant objects that are needed for downstream evaluation.
+     """
+
+     evaluators: Dict[str, Callable]
+     graders: Dict[str, AzureOpenAIGrader]
+     input_data_df: pd.DataFrame
+     column_mapping: Dict[str, Dict[str, str]]
+     target_run: Optional[BatchClientRun]
+     batch_run_client: BatchClient
+     batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+
+
  def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
      """Identify and average various metrics that need to have the metric name be replaced,
      instead of having the metric match the originating column name.
@@ -70,7 +111,7 @@ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, flo
          if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
              renamed_cols.append(col)
              new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
-             col_with_numeric_values = pd.to_numeric(df[col], errors="coerce")
+             col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
              try:
                  metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
              except EvaluationException:  # only exception that can be cause is all NaN values
@@ -111,7 +152,6 @@ def _aggregate_content_safety_metrics(
          module = inspect.getmodule(evaluators[evaluator_name])
          if (
              module
-             and module.__name__.startswith("azure.ai.evaluation.")
              and metric_name.endswith("_score")
              and metric_name.replace("_score", "") in content_safety_metrics
          ):
@@ -121,7 +161,7 @@
      defect_rates = {}
      for col in content_safety_df.columns:
          defect_rate_name = col.replace("_score", "_defect_rate")
-         col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
+         col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
          try:
              col_with_boolean_values = apply_transform_nan_safe(
                  col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
@@ -146,28 +186,151 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
      """
      handled_metrics = [
          EvaluationMetrics.PROTECTED_MATERIAL,
+         EvaluationMetrics.FICTIONAL_CHARACTERS,
+         EvaluationMetrics.ARTWORK,
+         EvaluationMetrics.LOGOS_AND_BRANDS,
          _InternalEvaluationMetrics.ECI,
          EvaluationMetrics.XPIA,
+         EvaluationMetrics.CODE_VULNERABILITY,
+         EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
      ]
      label_cols = []
+     details_cols = []
      for col in df.columns:
          metric_name = col.split(".")[1]
          if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
              label_cols.append(col)
+         if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+             details_cols = col

      label_df = df[label_cols]
      defect_rates = {}
      for col in label_df.columns:
          defect_rate_name = col.replace("_label", "_defect_rate")
-         col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
+         col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
          try:
              defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
          except EvaluationException:  # only exception that can be cause is all NaN values
              msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
              LOGGER.warning(msg)
+
+     if details_cols:
+         details_df = df[details_cols]
+         detail_defect_rates = {}
+
+         for key, value in details_df.items():
+             _process_rows(value, detail_defect_rates)
+
+         for key, value in detail_defect_rates.items():
+             col_with_boolean_values = pd.to_numeric(value, errors="coerce")
+             try:
+                 defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+                     list_mean_nan_safe(col_with_boolean_values), 2
+                 )
+             except EvaluationException:  # only exception that can be cause is all NaN values
+                 msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
+                 LOGGER.warning(msg)
+
      return label_cols, defect_rates


+ def _process_rows(row, detail_defect_rates):
+     for key, value in row.items():
+         if key not in detail_defect_rates:
+             detail_defect_rates[key] = []
+         detail_defect_rates[key].append(value)
+     return detail_defect_rates
+
+
+ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+     """
+     Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+     For each evaluator, calculates the proportion of "pass" results.
+
+     :param df: The dataframe of evaluation results.
+     :type df: ~pandas.DataFrame
+     :return: A dictionary mapping evaluator names to the proportion of pass results.
+     :rtype: Dict[str, float]
+     """
+     results = {}
+
+     # Find all columns that end with "_result"
+     result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+     for col in result_columns:
+         # Extract the evaluator name from the column name
+         # (outputs.<evaluator>.<metric>_result)
+         parts = col.split(".")
+         evaluator_name = None
+         if len(parts) >= 3:
+             evaluator_name = parts[1]
+         else:
+             LOGGER.warning(
+                 "Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col
+             )
+             continue
+         if evaluator_name:
+             # Count the occurrences of each unique value (pass/fail)
+             value_counts = df[col].value_counts().to_dict()
+
+             # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+             total_rows = len(df)
+             pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+             proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+             # Set the result with the evaluator name as the key
+             result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+             results[result_key] = round(proportion, 2)
+
+     return results
+
+
+ def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
+     """Identify token count columns from known SDK metrics that should be excluded from aggregation.
+
+     Token counts from custom evaluators are not excluded, only those from EvaluationMetrics
+     and _InternalEvaluationMetrics.
+
+     :param df: The dataframe of evaluation results.
+     :type df: ~pandas.DataFrame
+     :return: List of column names to exclude from aggregation.
+     :rtype: List[str]
+     """
+     # Get all metric values from EvaluationMetrics class
+     evaluation_metrics_values = [
+         getattr(EvaluationMetrics, attr)
+         for attr in dir(EvaluationMetrics)
+         if not attr.startswith("_") and isinstance(getattr(EvaluationMetrics, attr), str)
+     ]
+
+     # Get all metric values from _InternalEvaluationMetrics class
+     internal_metrics_values = [
+         getattr(_InternalEvaluationMetrics, attr)
+         for attr in dir(_InternalEvaluationMetrics)
+         if not attr.startswith("_") and isinstance(getattr(_InternalEvaluationMetrics, attr), str)
+     ]
+
+     # Combine all known metrics
+     all_known_metrics = evaluation_metrics_values + internal_metrics_values
+
+     # Find token count columns that belong to known metrics
+     token_count_cols = [
+         col
+         for col in df.columns
+         if (
+             any(
+                 col.endswith(f"{metric}_prompt_tokens")
+                 or col.endswith(f"{metric}_completion_tokens")
+                 or col.endswith(f"{metric}_total_tokens")
+                 for metric in all_known_metrics
+             )
+         )
+     ]
+
+     return token_count_cols
+
+
  def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
      """Aggregate metrics from the evaluation results.
      On top of naively calculating the mean of most metrics, this function also identifies certain columns
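The `_aggregation_binary_output` helper added above reduces per-row pass/fail labels to a single pass rate per evaluator. A minimal standalone sketch of the same calculation, assuming a results frame whose `outputs.<evaluator>.<metric>_result` columns contain "pass"/"fail" strings (the column name, label values, and `binary_aggregate` suffix here are illustrative stand-ins for the SDK's constants):

```python
import pandas as pd

# Toy results frame shaped like the columns the helper scans for.
df = pd.DataFrame({"outputs.fluency.fluency_result": ["pass", "fail", "pass", "pass"]})

pass_rates = {}
for col in df.columns:
    if col.startswith("outputs.") and col.endswith("_result"):
        evaluator_name = col.split(".")[1]
        # Proportion of rows labeled "pass" (the EVALUATION_PASS_FAIL_MAPPING[True] value).
        pass_count = int((df[col] == "pass").sum())
        pass_rates[f"{evaluator_name}.binary_aggregate"] = round(pass_count / len(df), 2)

print(pass_rates)  # {'fluency.binary_aggregate': 0.75}
```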
@@ -181,6 +344,8 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dic
      :return: The aggregated metrics.
      :rtype: Dict[str, float]
      """
+     binary_metrics = _aggregation_binary_output(df)
+
      df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

      handled_columns = []
@@ -198,9 +363,16 @@
      handled_columns.extend(label_cols)
      defect_rates.update(label_defect_rates)

+     # Exclude token count columns from aggregation for known SDK metrics
+     token_count_cols = _get_token_count_columns_to_exclude(df)
+     handled_columns.extend(token_count_cols)
+
      # For rest of metrics, we will calculate mean
      df.drop(columns=handled_columns, inplace=True)

+     # Convert "not applicable" strings to None to allow proper numeric aggregation
+     df = df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
+
      # NOTE: nan/None values don't count as as booleans, so boolean columns with
      # nan/None values won't have a mean produced from them.
      # This is different from label-based known evaluators, which have special handling.
@@ -208,6 +380,10 @@
      metrics = mean_value.to_dict()
      # Add defect rates back into metrics
      metrics.update(defect_rates)
+
+     # Add binary threshold metrics based on pass/fail results
+     metrics.update(binary_metrics)
+
      return metrics


@@ -299,7 +475,7 @@ def _validate_columns_for_evaluators(
              missing_inputs = []
          else:
              optional_params = (
-                 evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+                 cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
                  if hasattr(evaluator, "_OPTIONAL_PARAMS")
                  else []
              )
@@ -344,7 +520,7 @@
              )


- def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name, tags):
      if data is None:
          msg = "The 'data' parameter is required for evaluation."
          raise EvaluationException(
@@ -431,10 +607,11 @@
          )

      try:
-         initial_data_df = pd.read_json(data, lines=True)
+         data_loader = DataLoaderFactory.get_loader(data)
+         initial_data_df = data_loader.load()
      except Exception as e:
          raise EvaluationException(
-             message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
+             message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
              target=ErrorTarget.EVALUATE,
              category=ErrorCategory.INVALID_VALUE,
              blame=ErrorBlame.USER_ERROR,
@@ -445,21 +622,21 @@

  def _apply_target_to_data(
      target: Callable,
-     data: Union[str, os.PathLike],
-     pf_client: PFClient,
+     data: Union[str, os.PathLike, pd.DataFrame],
+     batch_client: BatchClient,
      initial_data: pd.DataFrame,
      evaluation_name: Optional[str] = None,
      **kwargs,
- ) -> Tuple[pd.DataFrame, Set[str], Run]:
+ ) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
      """
      Apply the target function to the data set and return updated data and generated columns.

      :param target: The function to be applied to data.
      :type target: Callable
-     :param data: The path to input jsonl file.
+     :param data: The path to input jsonl or csv file.
      :type data: Union[str, os.PathLike]
-     :param pf_client: The promptflow client to be used.
-     :type pf_client: PFClient
+     :param batch_client: The promptflow client to be used.
+     :type batch_client: PFClient
      :param initial_data: The data frame with the loaded data.
      :type initial_data: pd.DataFrame
      :param evaluation_name: The name of the evaluation.
@@ -467,36 +644,43 @@ def _apply_target_to_data(
      :return: The tuple, containing data frame and the list of added columns.
      :rtype: Tuple[pandas.DataFrame, List[str]]
      """
+
      _run_name = kwargs.get("_run_name")
-     upload_target_snaphot = kwargs.get("_upload_target_snapshot", False)
+     with TargetRunContext(batch_client):
+         run: BatchClientRun = batch_client.run(
+             flow=target,
+             display_name=evaluation_name,
+             data=data,
+             stream=True,
+             name=_run_name,
+             evaluator_name=getattr(target, "__qualname__", "TARGET"),
+         )
+         target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+         run_summary = batch_client.get_run_summary(run)

-     try:
-         with TargetRunContext(upload_target_snaphot):
-             run: Run = pf_client.run(
-                 flow=target,
-                 display_name=evaluation_name,
-                 data=data,
-                 properties={EvaluationRunProperties.RUN_TYPE: "eval_run", "isEvaluatorRun": "true"},
-                 stream=True,
-                 name=_run_name,
-             )
-     except (UserAuthenticationError, UploadInternalError) as ex:
-         if "Failed to upload run" in ex.message:
-             msg = (
-                 "Failed to upload the target run to the cloud. "
-                 "This may be caused by insufficient permission to access storage or other errors."
-             )
-             raise EvaluationException(
-                 message=msg,
-                 target=ErrorTarget.EVALUATE,
-                 category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                 blame=ErrorBlame.USER_ERROR,
-                 tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-             ) from ex
+     if run_summary["completed_lines"] == 0:
+         msg = (
+             f"Evaluation target failed to produce any results."
+             f" Please check the logs at {run_summary['log_path']} for more details about cause of failure."
+         )
+         raise EvaluationException(
+             message=msg,
+             target=ErrorTarget.EVALUATE,
+             category=ErrorCategory.FAILED_EXECUTION,
+             blame=ErrorBlame.USER_ERROR,
+         )

-         raise ex
+     # Log a warning if some rows failed
+     failed_lines = run_summary.get("failed_lines", 0)
+     completed_lines = run_summary["completed_lines"]
+     total_lines = failed_lines + completed_lines
+
+     if failed_lines > 0:
+         LOGGER.warning(
+             f"Target function completed {completed_lines} out of {total_lines} rows. "
+             f"{failed_lines} rows failed and will be filled with NaN values."
+         )

-     target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
      # Remove input and output prefix
      generated_columns = {
          col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -504,6 +688,13 @@
      # Sort output by line numbers
      target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
      target_output.sort_index(inplace=True)
+
+     initial_data_with_line_numbers = initial_data.copy()
+     initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+     complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+     target_output = target_output.reindex(complete_index)
+
      target_output.reset_index(inplace=True, drop=False)
      # target_output contains only input columns, taken by function,
      # so we need to concatenate it to the input data frame.
@@ -512,8 +703,8 @@
      # Rename outputs columns to __outputs
      rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
      target_output.rename(columns=rename_dict, inplace=True)
-     # Concatenate output to input
-     target_output = pd.concat([target_output, initial_data], axis=1)
+     # Concatenate output to input - now both dataframes have the same number of rows
+     target_output = pd.concat([initial_data, target_output], axis=1)

      return target_output, generated_columns, run

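The reindex added in the hunks above is what keeps a partially failed target run aligned with the input data: rows the target never completed become NaN instead of shifting later rows. A small pandas sketch of that alignment with toy data (not SDK code):

```python
import pandas as pd

inputs = pd.DataFrame({"query": ["a", "b", "c"]})                   # three input rows
target_out = pd.DataFrame({"response": ["A", "C"]}, index=[0, 2])   # row 1 failed

# Reindex the target output against the full 0..n-1 line-number index,
# then concatenate column-wise; the failed row surfaces as NaN.
aligned = target_out.reindex(range(len(inputs)))
combined = pd.concat([inputs, aligned], axis=1)
print(combined)
#   query response
# 0     a        A
# 1     b      NaN
# 2     c        C
```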
@@ -531,7 +722,7 @@ def _process_column_mappings(

      processed_config: Dict[str, Dict[str, str]] = {}

-     unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
+     expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

      if column_mapping:
          for evaluator, mapping_config in column_mapping.items():
@@ -540,7 +731,7 @@

              for map_to_key, map_value in mapping_config.items():
                  # Check if there's any unexpected reference other than ${target.} or ${data.}
-                 if unexpected_references.search(map_value):
+                 if not expected_references.search(map_value):
                      msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
                      raise EvaluationException(
                          message=msg,
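With the stricter pattern above, every `column_mapping` value must be exactly one `${data.<field>}` or `${target.<field>}` reference (dotted sub-fields allowed), rather than merely avoiding unknown prefixes. A quick check against the same regex (the sample values are illustrative):

```python
import re

expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

samples = ["${data.query}", "${target.response}", "${data.item.context}", "query", "${foo.bar}"]
for value in samples:
    # Accepted: single ${data.*} or ${target.*} reference; rejected: anything else.
    print(f"{value!r}: {bool(expected_references.search(value))}")
# '${data.query}': True
# '${target.response}': True
# '${data.item.context}': True
# 'query': False
# '${foo.bar}': False
```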
@@ -580,27 +771,29 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
      return df


- # @log_evaluate_activity
  def evaluate(
      *,
      data: Union[str, os.PathLike],
-     evaluators: Dict[str, Callable],
+     evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
      evaluation_name: Optional[str] = None,
      target: Optional[Callable] = None,
      evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
-     azure_ai_project: Optional[AzureAIProject] = None,
+     azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
      output_path: Optional[Union[str, os.PathLike]] = None,
+     fail_on_evaluator_errors: bool = False,
+     tags: Optional[Dict[str, str]] = None,
      **kwargs,
  ) -> EvaluationResult:
      """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
      data will be run through target function and then results will be evaluated.

      :keyword data: Path to the data to be evaluated or passed to target if target is set.
-         Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
+         JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
      :paramtype data: str
      :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
-         and value as the evaluator function. Required.
-     :paramtype evaluators: Dict[str, Callable]
+         and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
+         Required.
+     :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
      :keyword evaluation_name: Display name of the evaluation.
      :paramtype evaluation_name: Optional[str]
      :keyword target: Target to be evaluated. `target` and `data` both cannot be None
@@ -613,8 +806,20 @@
      :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
          the results will be saved to a file named `evaluation_results.json` in the folder.
      :paramtype output_path: Optional[str]
-     :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
-     :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
+     :keyword azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+         or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+     :paramtype azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
+     :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
+         if ANY evaluator fails during their evaluation.
+         Defaults to false, which means that evaluations will continue regardless of failures.
+         If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
+     :paramtype fail_on_evaluator_errors: bool
+     :keyword tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
+         Keys and values must be strings. For more information about tag limits, see:
+         https://learn.microsoft.com/en-us/azure/machine-learning/resource-limits-capacity?view=azureml-api-2#runs
+     :paramtype tags: Optional[Dict[str, str]]
+     :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
+     :paramtype user_agent: Optional[str]
      :return: Evaluation results.
      :rtype: ~azure.ai.evaluation.EvaluationResult

@@ -625,19 +830,34 @@
              :end-before: [END evaluate_method]
              :language: python
              :dedent: 8
-             :caption: Run an evaluation on local data with Coherence and Relevance evaluators.
+             :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START evaluate_method]
+             :end-before: [END evaluate_method]
+             :language: python
+             :dedent: 8
+             :caption: Run an evaluation on local data with one or more evaluators using Azure AI Project URL in following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
      """
      try:
-         return _evaluate(
-             evaluation_name=evaluation_name,
-             target=target,
-             data=data,
-             evaluators=evaluators,
-             evaluator_config=evaluator_config,
-             azure_ai_project=azure_ai_project,
-             output_path=output_path,
-             **kwargs,
-         )
+         user_agent: Optional[str] = kwargs.get("user_agent")
+         with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
+             results = _evaluate(
+                 evaluation_name=evaluation_name,
+                 target=target,
+                 data=data,
+                 evaluators_and_graders=evaluators,
+                 evaluator_config=evaluator_config,
+                 azure_ai_project=azure_ai_project,
+                 output_path=output_path,
+                 fail_on_evaluator_errors=fail_on_evaluator_errors,
+                 tags=tags,
+                 **kwargs,
+             )
+             return results
      except Exception as e:
          # Handle multiprocess bootstrap error
          bootstrap_error = (
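Taken together, the signature and docstring changes above mean a single `evaluate()` call can mix callable evaluators with Azure OpenAI graders, point at a project by endpoint URL, tag the run, and opt into fail-fast behavior. A hedged sketch of such a call follows: the file paths, endpoint, deployment, and the grader's constructor arguments are placeholders and assumptions, so check the current SDK reference for the exact grader class and parameters.

```python
from azure.ai.evaluation import evaluate, AzureOpenAIStringCheckGrader

# Any callable returning a dict of metrics works as a local evaluator.
def exact_match(*, response: str, ground_truth: str, **kwargs):
    return {"exact_match": float(response.strip() == ground_truth.strip())}

# Graders run remotely as Azure OpenAI eval runs; this construction is an
# assumption modeled on the string-check grader, not verified against this release.
greeting_grader = AzureOpenAIStringCheckGrader(
    model_config={
        "azure_endpoint": "https://<your-aoai-resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    },
    input="{{item.response}}",
    name="contains_greeting",
    operation="like",
    reference="hello",
)

result = evaluate(
    data="eval_data.jsonl",  # JSONL or CSV, per the new data loader path
    evaluators={"exact_match": exact_match, "greeting": greeting_grader},
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",
    tags={"team": "docs", "run": "nightly"},
    fail_on_evaluator_errors=False,
    output_path="./eval_results.json",
)
print(result["metrics"])
```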
@@ -684,22 +904,468 @@ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
684
904
  print("\n====================================================\n")
685
905
 
686
906
 
907
+ def _print_fail_flag_warning() -> None:
908
+ print(
909
+ "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
910
+ + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
911
+ + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
912
+ + "without producing any outputs, since a single failure will cancel the entire run "
913
+ "when fail_on_evaluator_errors is enabled."
914
+ )
915
+
916
+
687
917
  def _evaluate( # pylint: disable=too-many-locals,too-many-statements
688
918
  *,
689
- evaluators: Dict[str, Callable],
919
+ evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
690
920
  evaluation_name: Optional[str] = None,
691
921
  target: Optional[Callable] = None,
692
922
  data: Union[str, os.PathLike],
693
923
  evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
694
- azure_ai_project: Optional[AzureAIProject] = None,
924
+ azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
695
925
  output_path: Optional[Union[str, os.PathLike]] = None,
926
+ fail_on_evaluator_errors: bool = False,
927
+ tags: Optional[Dict[str, str]] = None,
696
928
  **kwargs,
697
929
  ) -> EvaluationResult:
698
- input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
930
+ if fail_on_evaluator_errors:
931
+ _print_fail_flag_warning()
932
+
933
+ # Turn inputted mess of data into a dataframe, apply targets if needed
934
+ # split graders and evaluators, and verify that column mappings are sensible.
935
+ validated_data = _preprocess_data(
936
+ data=data,
937
+ evaluators_and_graders=evaluators_and_graders,
938
+ evaluator_config=evaluator_config,
939
+ target=target,
940
+ output_path=output_path,
941
+ azure_ai_project=azure_ai_project,
942
+ evaluation_name=evaluation_name,
943
+ fail_on_evaluator_errors=fail_on_evaluator_errors,
944
+ tags=tags,
945
+ **kwargs,
946
+ )
947
+
948
+ # extract relevant info from validated data
949
+ column_mapping = validated_data["column_mapping"]
950
+ evaluators = validated_data["evaluators"]
951
+ graders = validated_data["graders"]
952
+ input_data_df = validated_data["input_data_df"]
953
+ results_df = pd.DataFrame()
954
+ metrics: Dict[str, float] = {}
955
+ eval_run_info_list: List[OAIEvalRunCreationInfo] = []
956
+ eval_run_summary_dict = {}
957
+
958
+ # Start OAI eval runs if any graders are present.
959
+ need_oai_run = len(graders) > 0
960
+ need_local_run = len(evaluators) > 0
961
+ need_get_oai_results = False
962
+ got_local_results = False
963
+ if need_oai_run:
964
+ try:
965
+ aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
966
+ eval_run_info_list = _begin_aoai_evaluation(graders, column_mapping, input_data_df, aoi_name, **kwargs)
967
+ need_get_oai_results = len(eval_run_info_list) > 0
968
+ except EvaluationException as e:
969
+ if need_local_run:
970
+ # If there are normal evaluators, don't stop execution and try to run
971
+ # those.
972
+ LOGGER.warning(
973
+ "Remote Azure Open AI grader evaluations failed during run creation."
974
+ + " Continuing with local evaluators."
975
+ )
976
+ LOGGER.warning(e)
977
+ else:
978
+ raise e
979
+
980
+ # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
981
+ if need_local_run:
982
+ try:
983
+ eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
984
+ validated_data=validated_data, fail_on_evaluator_errors=fail_on_evaluator_errors
985
+ )
986
+ results_df = eval_result_df
987
+ metrics = eval_metrics
988
+ got_local_results = True
989
+ # TODO figure out how to update this printing to include OAI results?
990
+ _print_summary(per_evaluator_results)
991
+ eval_run_summary_dict = {name: result["run_summary"] for name, result in per_evaluator_results.items()}
992
+ LOGGER.info(f"run_summary: \r\n{json.dumps(eval_run_summary_dict, indent=4)}")
993
+ except EvaluationException as e:
994
+ if need_get_oai_results:
995
+ # If there are OAI graders, we only print a warning on local failures.
996
+ LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
997
+ LOGGER.warning(e)
998
+ else:
999
+ raise e
1000
+
1001
+ # Retrieve OAI eval run results if needed.
1002
+ if need_get_oai_results:
1003
+ try:
1004
+ aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list) # type: ignore
1005
+ # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
1006
+
1007
+ # Combine results if both evaluators and graders are present
1008
+ if len(evaluators) > 0:
1009
+ results_df = pd.concat([results_df, aoai_results], axis=1)
1010
+ metrics.update(aoai_metrics)
1011
+ else:
1012
+ # Otherwise combine aoai results with input data df to include input columns in outputs.
1013
+ results_df = pd.concat([input_data_df, aoai_results], axis=1)
1014
+ metrics = aoai_metrics
1015
+ except EvaluationException as e:
1016
+ if got_local_results:
1017
+ # If there are local eval results, we only print a warning on OAI failure.
1018
+ LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
1019
+ LOGGER.warning(e)
1020
+ else:
1021
+ raise e
1022
+
1023
+ # Done with all evaluations, message outputs into final forms, and log results if needed.
1024
+ name_map = _map_names_to_builtins(evaluators, graders)
1025
+ if is_onedp_project(azure_ai_project):
1026
+ studio_url = _log_metrics_and_instance_results_onedp(
1027
+ metrics, results_df, azure_ai_project, evaluation_name, name_map, tags=tags, **kwargs
1028
+ )
1029
+ else:
1030
+ # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
1031
+ trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
1032
+ studio_url = None
1033
+ if trace_destination:
1034
+ studio_url = _log_metrics_and_instance_results(
1035
+ metrics, results_df, trace_destination, None, evaluation_name, name_map, tags=tags, **kwargs
1036
+ )
1037
+
1038
+ result_df_dict = results_df.to_dict("records")
1039
+ result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
1040
+ # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
1041
+
1042
+ eval_id: Optional[str] = kwargs.get("_eval_id")
1043
+ eval_run_id: Optional[str] = kwargs.get("_eval_run_id")
1044
+ eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("_eval_meta_data")
1045
+ if kwargs.get("_convert_to_aoai_evaluation_result", False):
1046
+ _convert_results_to_aoai_evaluation_results(
1047
+ result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data
1048
+ )
1049
+ if app_insights_configuration := kwargs.get("_app_insights_configuration"):
1050
+ emit_eval_result_events_to_app_insights(
1051
+ app_insights_configuration, result["_evaluation_results_list"], evaluator_config
1052
+ )
1053
+
1054
+ if output_path:
1055
+ _write_output(output_path, result)
1056
+ return result
1057
+
1058
+
1059
+ def _build_internal_log_attributes(
1060
+ event_data: Dict[str, Any],
1061
+ metric_name: str,
1062
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]],
1063
+ internal_log_attributes: Dict[str, str],
1064
+ ) -> Dict[str, str]:
1065
+ """
1066
+ Build internal log attributes for OpenTelemetry logging.
1067
+
1068
+ :param event_data: The event data containing threshold and name information
1069
+ :type event_data: Dict[str, Any]
1070
+ :param metric_name: The name of the metric being evaluated
1071
+ :type metric_name: str
1072
+ :param evaluator_config: Configuration for evaluators
1073
+ :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
1074
+ :return: Dictionary of internal log attributes
1075
+ :rtype: Dict[str, str]
1076
+ """
1077
+ # Add threshold if present
1078
+ if event_data.get("threshold"):
1079
+ internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
1080
+
1081
+ # Add testing criteria details if present
1082
+ testing_criteria_name = event_data.get("name")
1083
+ if testing_criteria_name:
1084
+ internal_log_attributes["gen_ai.evaluation.testing_criteria.name"] = testing_criteria_name
1085
+
1086
+ # Get evaluator definition details
1087
+ if evaluator_config and testing_criteria_name in evaluator_config:
1088
+ testing_criteria_config = evaluator_config[testing_criteria_name]
1089
+
1090
+ if evaluator_name := testing_criteria_config.get("_evaluator_name"):
1091
+ internal_log_attributes["gen_ai.evaluator.name"] = str(evaluator_name)
1092
+
1093
+ if evaluator_version := testing_criteria_config.get("_evaluator_version"):
1094
+ internal_log_attributes["gen_ai.evaluator.version"] = str(evaluator_version)
1095
+
1096
+ if evaluator_id := testing_criteria_config.get("_evaluator_id"):
1097
+ internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
1098
+
1099
+ if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
1100
+ metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
1101
+
1102
+ if metric_config_detail:
1103
+ if metric_config_detail.get("min_value") is not None:
1104
+ internal_log_attributes["gen_ai.evaluation.min_value"] = str(metric_config_detail["min_value"])
1105
+ if metric_config_detail.get("max_value") is not None:
1106
+ internal_log_attributes["gen_ai.evaluation.max_value"] = str(metric_config_detail["max_value"])
1107
+
1108
+ return internal_log_attributes
1109
+
1110
+
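For orientation, the helper above turns per-event fields into namespaced gen_ai.* attributes that later land in the internal property bag. A minimal standalone sketch of that mapping, using made-up event and config values rather than anything shipped in the package:

    # Illustrative values only; the keys mirror the ones read by the helper above.
    event_data = {"threshold": 0.7, "name": "relevance_check"}
    evaluator_config = {"relevance_check": {"_evaluator_name": "relevance", "_evaluator_version": "1"}}

    attrs = {}
    if event_data.get("threshold") is not None:
        attrs["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
    if name := event_data.get("name"):
        attrs["gen_ai.evaluation.testing_criteria.name"] = name
        cfg = evaluator_config.get(name, {})
        if evaluator_name := cfg.get("_evaluator_name"):
            attrs["gen_ai.evaluator.name"] = str(evaluator_name)
        if evaluator_version := cfg.get("_evaluator_version"):
            attrs["gen_ai.evaluator.version"] = str(evaluator_version)
    # attrs now holds the gen_ai.* keys that get serialized into "internal_properties".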
1111
+ def _log_events_to_app_insights(
1112
+ event_logger,
1113
+ events: List[Dict[str, Any]],
1114
+ log_attributes: Dict[str, Any],
1115
+ app_insights_config: AppInsightsConfig,
1116
+ data_source_item: Optional[Dict[str, Any]] = None,
1117
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
1118
+ ) -> None:
1119
+ """
1120
+ Log independent events directly to App Insights using OpenTelemetry event logging.
1121
+
1122
+ :param event_logger: OpenTelemetry event logger instance
1123
+ :type event_logger: EventLogger
1124
+ :param events: List of event data dictionaries to log
1125
+ :type events: List[Dict[str, Any]]
1126
+ :param log_attributes: Attributes dict to use for each event (already includes extra_attributes if present)
1127
+ :type log_attributes: Dict[str, Any]
1128
+ :param app_insights_config: App Insights configuration containing connection string
1129
+ :type app_insights_config: AppInsightsConfig
1130
+ :param data_source_item: Data source item containing trace, response, and agent information
1131
+ :type data_source_item: Optional[Dict[str, Any]]
+ :param evaluator_config: Optional per-evaluator configuration used to enrich internal log attributes
+ :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
1132
+ """
1133
+
1134
+ from opentelemetry._events import Event
1135
+
1136
+ try:
1137
+ # Initialize values from AppInsights config as defaults
1138
+ trace_id = None
1139
+ span_id = None
1140
+ response_id = None
1141
+ conversation_id = None
1142
+ previous_response_id = None
1143
+ agent_id = app_insights_config.get("agent_id", None)
1144
+ agent_version = app_insights_config.get("agent_version", None)
1145
+ agent_name = app_insights_config.get("agent_name", None)
1146
+
1147
+ # Data source item values have higher priority and will override AppInsights config defaults
1148
+ if data_source_item:
1149
+ for key, value in data_source_item.items():
1150
+ if key.endswith("trace_id") and value and isinstance(value, str):
1151
+ # Remove dashes if present
1152
+ trace_id_str = str(value).replace("-", "").lower()
1153
+ if len(trace_id_str) == 32: # Valid trace_id length
1154
+ trace_id = int(trace_id_str, 16)
1155
+ elif key == "previous_response_id" and value and isinstance(value, str):
1156
+ previous_response_id = value
1157
+ elif key == "response_id" and value and isinstance(value, str):
1158
+ response_id = value
1159
+ elif key == "conversation_id" and value and isinstance(value, str):
1160
+ conversation_id = value
1161
+ elif key == "agent_id" and value and isinstance(value, str):
1162
+ agent_id = value
1163
+ elif key.endswith("span_id") and value and isinstance(value, str):
1164
+ # Remove dashes if present and convert to int
1165
+ span_id_str = str(value).replace("-", "").lower()
1166
+ if len(span_id_str) == 16: # Valid span_id length (64-bit = 16 hex chars)
1167
+ span_id = int(span_id_str, 16)
1168
+ elif key == "agent_version" and value and isinstance(value, str):
1169
+ agent_version = value
1170
+ elif key == "agent_name" and value and isinstance(value, str):
1171
+ agent_name = value
1172
+
1173
+ # Log each event as a separate log record
1174
+ for i, event_data in enumerate(events):
1175
+ try:
1176
+ # Prepare log record attributes with specific mappings
1177
+ # The standard attributes are already in https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-events.md#event-eventgen_aievaluationresult
1178
+ metric_name = event_data.get("metric")
1179
+ standard_log_attributes = {}
1180
+ # This attribute makes evaluation events go into the customEvents table in App Insights
1181
+ standard_log_attributes["microsoft.custom_event.name"] = EVALUATION_EVENT_NAME
1182
+ standard_log_attributes["gen_ai.evaluation.name"] = metric_name
1183
+ if event_data.get("score") is not None:
1184
+ standard_log_attributes["gen_ai.evaluation.score.value"] = event_data.get("score")
1185
+ if event_data.get("label") is not None:
1186
+ standard_log_attributes["gen_ai.evaluation.score.label"] = event_data.get("label")
1187
+
1188
+ # Internal proposed attributes
1189
+ # Keep these in an internal property bag for now; they will be promoted if we get sign-off to add them to the OTel standard later.
1190
+ internal_log_attributes = _build_internal_log_attributes(
1191
+ event_data, metric_name, evaluator_config, log_attributes
1192
+ )
1193
+
1194
+ # Optional field that may not always be present
1195
+ if "reason" in event_data:
1196
+ standard_log_attributes["gen_ai.evaluation.explanation"] = str(event_data["reason"])
1197
+
1198
+ # Handle error from sample if present
1199
+ # Put the error message in error.type to follow OTel semantic conventions
1200
+ error = event_data.get("sample", {}).get("error", {}).get("message", None)
1201
+ if error:
1202
+ standard_log_attributes["error.type"] = error
1203
+
1204
+ # Handle redteam attack properties if present
1205
+ if "properties" in event_data:
1206
+ properties = event_data["properties"]
1207
+
1208
+ if "attack_success" in properties:
1209
+ internal_log_attributes["gen_ai.redteam.attack.success"] = str(properties["attack_success"])
1210
+
1211
+ if "attack_technique" in properties:
1212
+ internal_log_attributes["gen_ai.redteam.attack.technique"] = str(properties["attack_technique"])
1213
+
1214
+ if "attack_complexity" in properties:
1215
+ internal_log_attributes["gen_ai.redteam.attack.complexity"] = str(
1216
+ properties["attack_complexity"]
1217
+ )
1218
+
1219
+ if "attack_success_threshold" in properties:
1220
+ internal_log_attributes["gen_ai.redteam.attack.success_threshold"] = str(
1221
+ properties["attack_success_threshold"]
1222
+ )
1223
+
1224
+ # Add data source item attributes if present
1225
+ if response_id:
1226
+ standard_log_attributes["gen_ai.response.id"] = response_id
1227
+ if conversation_id:
1228
+ standard_log_attributes["gen_ai.conversation.id"] = conversation_id
1229
+ if previous_response_id:
1230
+ internal_log_attributes["gen_ai.previous.response.id"] = previous_response_id
1231
+ if agent_id:
1232
+ standard_log_attributes["gen_ai.agent.id"] = agent_id
1233
+ if agent_name:
1234
+ standard_log_attributes["gen_ai.agent.name"] = agent_name
1235
+ if agent_version:
1236
+ internal_log_attributes["gen_ai.agent.version"] = agent_version
1237
+
1238
+ # Combine standard and internal attributes, put internal under the properties bag
1239
+ standard_log_attributes["internal_properties"] = json.dumps(internal_log_attributes)
1240
+ # Anonymize IP address to prevent Azure GeoIP enrichment and location tracking
1241
+ standard_log_attributes["http.client_ip"] = "0.0.0.0"
1242
+
1243
+ event_logger.emit(
1244
+ Event(
1245
+ name=EVALUATION_EVENT_NAME,
1246
+ attributes=standard_log_attributes,
1247
+ body=EVALUATION_EVENT_NAME,
1248
+ trace_id=trace_id,
1249
+ span_id=span_id,
1250
+ )
1251
+ )
1252
+
1253
+ except Exception as e:
1254
+ LOGGER.warning(f"Failed to log event {i}: {e}")
1255
+
1256
+ except Exception as e:
1257
+ LOGGER.error(f"Failed to log events to App Insights: {e}")
1258
+
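The trace and span identifiers handled above arrive as hex strings and are converted to the integer form OpenTelemetry expects. A small sketch with made-up identifiers:

    # Illustrative only: W3C-style ids with dashes stripped, converted to ints.
    trace_id_str = "4bf92f3577b34da6a3ce929d0e0e4736"  # 32 hex chars -> 128-bit trace id
    span_id_str = "00f067aa0ba902b7"                   # 16 hex chars -> 64-bit span id

    trace_id = int(trace_id_str, 16) if len(trace_id_str) == 32 else None
    span_id = int(span_id_str, 16) if len(span_id_str) == 16 else None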
699
1259
 
1260
+ def emit_eval_result_events_to_app_insights(
1261
+ app_insights_config: AppInsightsConfig,
1262
+ results: List[Dict],
1263
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
1264
+ ) -> None:
1265
+ """
1266
+ Emit evaluation result events to App Insights using OpenTelemetry logging.
1267
+ Each result is logged as an independent log record, potentially including trace context.
1268
+
1269
+ :param app_insights_config: App Insights configuration containing connection string
1270
+ :type app_insights_config: AppInsightsConfig
1271
+ :param results: List of evaluation results to log
1272
+ :type results: List[Dict]
+ :param evaluator_config: Optional per-evaluator configuration used to enrich internal log attributes
+ :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
1273
+ """
1274
+
1275
+ from opentelemetry import _logs
1276
+ from opentelemetry.sdk._logs import LoggerProvider
1277
+ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
1278
+ from opentelemetry.sdk.resources import Resource
1279
+ from opentelemetry.semconv.resource import ResourceAttributes
1280
+ from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter
1281
+ from opentelemetry._events import get_event_logger
1282
+ from opentelemetry.sdk._events import EventLoggerProvider
1283
+
1284
+ if not results:
1285
+ LOGGER.debug("No results to log to App Insights")
1286
+ return
1287
+
1288
+ try:
1289
+ # Configure OpenTelemetry logging with anonymized Resource attributes
1290
+
1291
+ # Create a resource with minimal attributes to prevent sensitive data collection
1292
+ # SERVICE_INSTANCE_ID maps to cloud_RoleInstance in Azure Monitor and prevents
1293
+ # Azure Monitor from auto-detecting the device hostname
1294
+ anonymized_resource = Resource.create(
1295
+ {
1296
+ ResourceAttributes.SERVICE_NAME: "unknown",
1297
+ ResourceAttributes.SERVICE_INSTANCE_ID: "unknown",
1298
+ }
1299
+ )
1300
+
1301
+ logger_provider = LoggerProvider(resource=anonymized_resource)
1302
+ _logs.set_logger_provider(logger_provider)
1303
+
1304
+ # Create Azure Monitor log exporter
1305
+ azure_log_exporter = AzureMonitorLogExporter(connection_string=app_insights_config["connection_string"])
1306
+
1307
+ # Add the Azure Monitor exporter to the logger provider
1308
+ logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
1309
+
1310
+ # Create event logger
1311
+ event_provider = EventLoggerProvider(logger_provider)
1312
+ event_logger = get_event_logger(__name__, event_logger_provider=event_provider)
1313
+
1314
+ # Initialize base log attributes with extra_attributes if present, otherwise empty dict
1315
+ base_log_attributes = app_insights_config.get("extra_attributes", {})
1316
+
1317
+ # Add AppInsights config attributes with proper semantic convention mappings
1318
+ if "run_type" in app_insights_config:
1319
+ base_log_attributes["gen_ai.evaluation.azure_ai_type"] = str(app_insights_config["run_type"])
1320
+ if "schedule_type" in app_insights_config:
1321
+ base_log_attributes["gen_ai.evaluation.azure_ai_scheduled"] = str(app_insights_config["schedule_type"])
1322
+ if "run_id" in app_insights_config:
1323
+ base_log_attributes["gen_ai.evaluation.run.id"] = str(app_insights_config["run_id"])
1324
+ if "project_id" in app_insights_config:
1325
+ base_log_attributes["gen_ai.azure_ai_project.id"] = str(app_insights_config["project_id"])
1326
+
1327
+ for result in results:
1328
+ # Create a copy of base attributes for this result's events
1329
+ log_attributes = base_log_attributes.copy()
1330
+
1331
+ _log_events_to_app_insights(
1332
+ event_logger=event_logger,
1333
+ events=result["results"],
1334
+ log_attributes=log_attributes,
1335
+ data_source_item=result["datasource_item"] if "datasource_item" in result else None,
1336
+ evaluator_config=evaluator_config,
1337
+ app_insights_config=app_insights_config,
1338
+ )
1339
+ # Force flush to ensure events are sent
1340
+ logger_provider.force_flush()
1341
+ LOGGER.info(f"Successfully logged {len(results)} evaluation results to App Insights")
1342
+
1343
+ except Exception as e:
1344
+ LOGGER.error(f"Failed to emit evaluation results to App Insights: {e}")
1345
+
1346
+
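As a rough illustration of the configuration consumed above, a hypothetical AppInsightsConfig could look like the following; every value here is a placeholder, and only connection_string is strictly required by the exporter setup:

    # Hypothetical configuration; the ids and connection string are placeholders.
    app_insights_configuration = {
        "connection_string": "InstrumentationKey=00000000-0000-0000-0000-000000000000",
        "run_type": "evaluation",
        "schedule_type": "manual",
        "run_id": "example-run-id",
        "project_id": "example-project-id",
        "extra_attributes": {"team": "example"},
    }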
1347
+ def _preprocess_data(
1348
+ data: Union[str, os.PathLike],
1349
+ evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
1350
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
1351
+ target: Optional[Callable] = None,
1352
+ output_path: Optional[Union[str, os.PathLike]] = None,
1353
+ azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
1354
+ evaluation_name: Optional[str] = None,
1355
+ fail_on_evaluator_errors: bool = False,
1356
+ tags: Optional[Dict[str, str]] = None,
1357
+ **kwargs,
1358
+ ) -> __ValidatedData:
700
1359
  # Process evaluator config to replace ${target.} with ${data.}
701
1360
  if evaluator_config is None:
702
1361
  evaluator_config = {}
1362
+
1363
+ input_data_df = _validate_and_load_data(
1364
+ target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name, tags
1365
+ )
1366
+ if target is not None:
1367
+ _validate_columns_for_target(input_data_df, target)
1368
+
703
1369
  # extract column mapping dicts into dictionary mapping evaluator name to column mapping
704
1370
  column_mapping = _process_column_mappings(
705
1371
  {
@@ -708,35 +1374,115 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
708
1374
  }
709
1375
  )
710
1376
 
711
- if target is not None:
712
- _validate_columns_for_target(input_data_df, target)
713
-
714
- pf_client = PFClient(user_agent=USER_AGENT)
715
- target_run: Optional[Run] = None
716
-
717
1377
  # Create default configuration for evaluators that directly maps
718
1378
  # input data names to keyword inputs of the same name in the evaluators.
719
1379
  column_mapping = column_mapping or {}
720
1380
  column_mapping.setdefault("default", {})
721
1381
 
722
- # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
1382
+ # Split normal evaluators and OAI graders
1383
+ evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
1384
+
1385
+ target_run: Optional[BatchClientRun] = None
723
1386
  target_generated_columns: Set[str] = set()
1387
+ batch_run_client: BatchClient
1388
+ batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
1389
+
1390
+ def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
1391
+ """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
1392
+ _use_run_submitter_client = cast(Optional[bool], evaluate_kwargs.pop("_use_run_submitter_client", None))
1393
+ _use_pf_client = cast(Optional[bool], evaluate_kwargs.pop("_use_pf_client", None))
1394
+
1395
+ if _use_run_submitter_client is None and _use_pf_client is None:
1396
+ # If both are unset, return default
1397
+ return "run_submitter"
1398
+
1399
+ if _use_run_submitter_client and _use_pf_client:
1400
+ raise EvaluationException(
1401
+ message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
1402
+ target=ErrorTarget.EVALUATE,
1403
+ category=ErrorCategory.INVALID_VALUE,
1404
+ blame=ErrorBlame.USER_ERROR,
1405
+ )
1406
+
1407
+ if _use_run_submitter_client == False and _use_pf_client == False:
1408
+ return "code_client"
1409
+
1410
+ if _use_run_submitter_client:
1411
+ return "run_submitter"
1412
+ if _use_pf_client:
1413
+ return "pf_client"
1414
+
1415
+ if _use_run_submitter_client is None and _use_pf_client == False:
1416
+ return "run_submitter"
1417
+ if _use_run_submitter_client == False and _use_pf_client is None:
1418
+ return "pf_client"
1419
+
1420
+ assert False, "This should be impossible"
1421
+
1422
+ client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
1423
+
1424
+ if client_type == "run_submitter":
1425
+ batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
1426
+ batch_run_data = input_data_df
1427
+ elif client_type == "pf_client":
1428
+ batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
1429
+ # Ensure the absolute path is passed to pf.run, as a relative path doesn't work with
1430
+ # multiple evaluators. If the path is already absolute, abspath will return the original path.
1431
+ batch_run_data = os.path.abspath(data)
1432
+ elif client_type == "code_client":
1433
+ batch_run_client = CodeClient()
1434
+ batch_run_data = input_data_df
1435
+
1436
+ # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
724
1437
  if data is not None and target is not None:
725
1438
  input_data_df, target_generated_columns, target_run = _apply_target_to_data(
726
- target, data, pf_client, input_data_df, evaluation_name, **kwargs
1439
+ target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
727
1440
  )
728
1441
 
729
- for evaluator_name, mapping in column_mapping.items():
730
- mapped_to_values = set(mapping.values())
731
- for col in target_generated_columns:
732
- # If user defined mapping differently, do not change it.
733
- # If it was mapped to target, we have already changed it
734
- # in _process_column_mappings
735
- run_output = f"${{run.outputs.{col}}}"
736
- # We will add our mapping only if
737
- # customer did not mapped target output.
738
- if col not in mapping and run_output not in mapped_to_values:
739
- column_mapping[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
1442
+ # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
1443
+ # This ensures that evaluators get all rows (including failed ones with NaN values)
1444
+ if isinstance(batch_run_client, ProxyClient):
1445
+ # Create a temporary JSONL file with the complete dataframe
1446
+ temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
1447
+ try:
1448
+ for _, row in input_data_df.iterrows():
1449
+ row_dict = row.to_dict()
1450
+ temp_file.write(json.dumps(row_dict) + "\n")
1451
+ temp_file.close()
1452
+ batch_run_data = temp_file.name
1453
+
1454
+ # Update column mappings to use data references instead of run outputs
1455
+ for evaluator_name, mapping in column_mapping.items():
1456
+ mapped_to_values = set(mapping.values())
1457
+ for col in target_generated_columns:
1458
+ # Use data reference instead of run output to ensure we get all rows
1459
+ target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
1460
+
1461
+ # We will add our mapping only if customer did not map target output.
1462
+ if col not in mapping and target_reference not in mapped_to_values:
1463
+ column_mapping[evaluator_name][col] = target_reference
1464
+
1465
+ # Don't pass the target_run since we're now using the complete dataframe
1466
+ target_run = None
1467
+
1468
+ except Exception as e:
1469
+ # Clean up the temp file if something goes wrong
1470
+ if os.path.exists(temp_file.name):
1471
+ os.unlink(temp_file.name)
1472
+ raise e
1473
+ else:
1474
+ # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
1475
+ batch_run_data = input_data_df
1476
+
1477
+ # Update column mappings for DataFrame clients
1478
+ for evaluator_name, mapping in column_mapping.items():
1479
+ mapped_to_values = set(mapping.values())
1480
+ for col in target_generated_columns:
1481
+ target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
1482
+
1483
+ # We will add our mapping only if customer did not map target output.
1484
+ if col not in mapping and target_reference not in mapped_to_values:
1485
+ column_mapping[evaluator_name][col] = target_reference
740
1486
 
741
1487
  # After we have generated all columns, we can check if we have everything we need for evaluators.
742
1488
  _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
@@ -745,24 +1491,156 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
745
1491
  # via target mapping.
746
1492
  # If both the data and the output dictionary of the target function
747
1493
  # have the same column, then the target function value is used.
1494
+ # NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
1495
+ # Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
1496
+ if input_data_df is not None:
1497
+ if "conversation" in input_data_df.columns or "messages" in input_data_df.columns:
1498
+ # No action is taken when 'conversation' or 'messages' columns are present,
1499
+ # as these indicate chat/conversation data which should not be flattened or mapped by default.
1500
+ pass
1501
+ else:
1502
+ input_data_df = _flatten_object_columns_for_default_mapping(input_data_df)
1503
+
1504
+ # Build default mapping for leaves:
748
1505
  if input_data_df is not None:
1506
+ # First, map flattened nested columns (those containing a dot) to leaf names.
749
1507
  for col in input_data_df.columns:
750
- # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
751
- # Also ignore columns that are already in config, since they've been covered by target mapping.
752
- if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in column_mapping["default"].keys():
1508
+ # Skip target output columns
1509
+ if col.startswith(Prefixes.TSG_OUTPUTS):
1510
+ continue
1511
+ # Skip root container columns (no dot) here; they'll be handled below if truly primitive.
1512
+ if "." in col:
1513
+ leaf_name = col.split(".")[-1]
1514
+ if leaf_name not in column_mapping["default"]:
1515
+ column_mapping["default"][leaf_name] = f"${{data.{col}}}"
1516
+
1517
+ # Then, handle remaining top-level primitive columns (original logic).
1518
+ for col in input_data_df.columns:
1519
+ if (
1520
+ not col.startswith(Prefixes.TSG_OUTPUTS)
1521
+ and col not in column_mapping["default"].keys()
1522
+ and "." not in col # only pure top-level primitives
1523
+ ):
753
1524
  column_mapping["default"][col] = f"${{data.{col}}}"
754
1525
 
755
- def eval_batch_run(
756
- batch_run_client: TClient, *, data=Union[str, os.PathLike, pd.DataFrame]
757
- ) -> Dict[str, __EvaluatorInfo]:
1526
+ return __ValidatedData(
1527
+ evaluators=evaluators,
1528
+ graders=graders,
1529
+ input_data_df=input_data_df,
1530
+ column_mapping=column_mapping,
1531
+ target_run=target_run,
1532
+ batch_run_client=batch_run_client,
1533
+ batch_run_data=batch_run_data,
1534
+ )
1535
+
1536
+
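The private _use_run_submitter_client and _use_pf_client kwargs handled inside _preprocess_data above resolve to a batch client as sketched below. This is a standalone restatement of that decision logic; the pick_client name is ours, not the package's:

    # Illustrative decision table for the private _use_* kwargs.
    def pick_client(use_run_submitter=None, use_pf=None):
        if use_run_submitter is None and use_pf is None:
            return "run_submitter"  # default
        if use_run_submitter and use_pf:
            raise ValueError("Only one of the two flags may be True.")
        if use_run_submitter is False and use_pf is False:
            return "code_client"
        if use_run_submitter:
            return "run_submitter"
        if use_pf:
            return "pf_client"
        return "run_submitter" if use_pf is False else "pf_client"

    assert pick_client() == "run_submitter"
    assert pick_client(use_pf=True) == "pf_client"
    assert pick_client(use_run_submitter=False, use_pf=False) == "code_client"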
1537
+ def _flatten_object_columns_for_default_mapping(
1538
+ df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
1539
+ ) -> pd.DataFrame:
1540
+ """Flatten nested dictionary-valued columns into dotted leaf columns.
1541
+
1542
+ For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
1543
+ leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
1544
+ columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
1545
+ all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
1546
+ are all treated as leaves). Existing columns are never overwritten (idempotent behavior).
1547
+
1548
+ Example
1549
+ If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
1550
+ columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.
1551
+
1552
+ :param df: Input DataFrame to flatten in place.
1553
+ :type df: ~pandas.DataFrame
1554
+ :param root_prefixes: Optional iterable restricting which top-level columns are considered
1555
+ for flattening. If ``None``, all columns containing at least one ``dict`` value are processed.
1556
+ :type root_prefixes: Optional[Iterable[str]]
1557
+ :return: The same DataFrame instance (returned for convenient chaining).
1558
+ :rtype: ~pandas.DataFrame
1559
+ """
1560
+ candidate_cols = []
1561
+ if root_prefixes is not None:
1562
+ candidate_cols = [c for c in root_prefixes if c in df.columns]
1563
+ else:
1564
+ # pick columns where at least one non-null value is a dict
1565
+ for c in df.columns:
1566
+ series = df[c]
1567
+ if series.map(lambda v: isinstance(v, dict)).any():
1568
+ candidate_cols.append(c)
1569
+
1570
+ def _extract_leaves(obj: Any, prefix: str) -> Iterator[Tuple[str, Any]]:
1571
+ if isinstance(obj, dict):
1572
+ for k, v in obj.items():
1573
+ new_prefix = f"{prefix}.{k}" if prefix else k
1574
+ if isinstance(v, dict):
1575
+ yield from _extract_leaves(v, new_prefix)
1576
+ else:
1577
+ # treat list / primitive / None as leaf
1578
+ yield new_prefix, v
1579
+
1580
+ for root_col in candidate_cols:
1581
+ # Build a union of leaf paths across rows to ensure consistent columns
1582
+ leaf_paths: Set[str] = set()
1583
+ for val in df[root_col]:
1584
+ if isinstance(val, dict):
1585
+ for path, _ in _extract_leaves(val, root_col):
1586
+ leaf_paths.add(path)
1587
+
1588
+ if not leaf_paths:
1589
+ continue
1590
+
1591
+ # Create each flattened column if absent
1592
+ for path in leaf_paths:
1593
+ if path in df.columns:
1594
+ continue # already present
1595
+ relative_keys = path[len(root_col) + 1 :].split(".") if len(path) > len(root_col) else []
1596
+
1597
+ def getter(root_val: Any) -> Any:
1598
+ cur = root_val
1599
+ for rk in relative_keys:
1600
+ if not isinstance(cur, dict):
1601
+ return None
1602
+ cur = cur.get(rk, None)
1603
+ return cur
1604
+
1605
+ df[path] = df[root_col].map(lambda rv: getter(rv) if isinstance(rv, dict) else None)
1606
+
1607
+ return df
1608
+
1609
+
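A small standalone pandas example of the flattening behaviour described in the docstring above (the column names are illustrative):

    import pandas as pd

    # One nested-object column named "item"; flattening adds dotted leaf columns
    # so the default mapping can later reference "${data.item.a.b}".
    df = pd.DataFrame({"query": ["What is 2+2?"], "item": [{"a": {"b": 1, "c": 2}}]})
    df["item.a.b"] = df["item"].map(lambda v: v.get("a", {}).get("b") if isinstance(v, dict) else None)
    df["item.a.c"] = df["item"].map(lambda v: v.get("a", {}).get("c") if isinstance(v, dict) else None)

    print(sorted(df.columns))  # ['item', 'item.a.b', 'item.a.c', 'query']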
1610
+ def _run_callable_evaluators(
1611
+ validated_data: __ValidatedData,
1612
+ fail_on_evaluator_errors: bool = False,
1613
+ **kwargs,
1614
+ ) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
1615
+
1616
+ # Extract needed values
1617
+ batch_run_client = validated_data["batch_run_client"]
1618
+ target_run = validated_data["target_run"]
1619
+ batch_run_data = validated_data["batch_run_data"]
1620
+ column_mapping = validated_data["column_mapping"]
1621
+ evaluators = validated_data["evaluators"]
1622
+
1623
+ # Clean up temporary file after evaluation if it was created
1624
+ temp_file_to_cleanup = None
1625
+ if (
1626
+ isinstance(batch_run_client, ProxyClient)
1627
+ and isinstance(batch_run_data, str)
1628
+ and batch_run_data.endswith(".jsonl")
1629
+ ):
1630
+ # Check if it's a temporary file (contains temp directory path)
1631
+ if tempfile.gettempdir() in batch_run_data:
1632
+ temp_file_to_cleanup = batch_run_data
1633
+
1634
+ try:
758
1635
  with EvalRunContext(batch_run_client):
759
1636
  runs = {
760
1637
  evaluator_name: batch_run_client.run(
761
1638
  flow=evaluator,
1639
+ data=batch_run_data,
1640
+ # Don't pass target_run when using complete dataframe
762
1641
  run=target_run,
763
1642
  evaluator_name=evaluator_name,
764
1643
  column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
765
- data=data,
766
1644
  stream=True,
767
1645
  name=kwargs.get("_run_name"),
768
1646
  )
@@ -770,7 +1648,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
770
1648
  }
771
1649
 
772
1650
  # get_details needs to be called within EvalRunContext scope in order to have user agent populated
773
- return {
1651
+ per_evaluator_results: Dict[str, __EvaluatorInfo] = {
774
1652
  evaluator_name: {
775
1653
  "result": batch_run_client.get_details(run, all_results=True),
776
1654
  "metrics": batch_run_client.get_metrics(run),
@@ -778,22 +1656,21 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
778
1656
  }
779
1657
  for evaluator_name, run in runs.items()
780
1658
  }
781
-
782
- # Batch Run
783
- use_pf_client = kwargs.get("_use_pf_client", True)
784
- if use_pf_client:
785
- # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
786
- # multiple evaluators. If the path is already absolute, abspath will return the original path.
787
- data = os.path.abspath(data)
788
- per_evaluator_results = eval_batch_run(ProxyClient(pf_client), data=data)
789
- else:
790
- data = input_data_df
791
- per_evaluator_results = eval_batch_run(CodeClient(), data=input_data_df)
792
-
1659
+ finally:
1660
+ # Clean up temporary file if it was created
1661
+ if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
1662
+ try:
1663
+ os.unlink(temp_file_to_cleanup)
1664
+ except Exception as e:
1665
+ LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
793
1666
  # Concatenate all results
794
- evaluators_result_df = None
1667
+ evaluators_result_df = pd.DataFrame()
795
1668
  evaluators_metric = {}
796
1669
  for evaluator_name, evaluator_result in per_evaluator_results.items():
1670
+ if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
1671
+ _print_summary(per_evaluator_results)
1672
+ _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
1673
+
797
1674
  evaluator_result_df = evaluator_result["result"]
798
1675
 
799
1676
  # drop input columns
@@ -822,31 +1699,821 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
822
1699
  # Rename columns, generated by target function to outputs instead of inputs.
823
1700
  # If target generates columns, already present in the input data, these columns
824
1701
  # will be marked as outputs already so we do not need to rename them.
825
- input_data_df = _rename_columns_conditionally(input_data_df)
826
-
827
- result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
828
- metrics = _aggregate_metrics(evaluators_result_df, evaluators)
829
- metrics.update(evaluators_metric)
830
-
831
- # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
832
- target_run = None
833
- trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
834
- studio_url = None
835
- if trace_destination:
836
- studio_url = _log_metrics_and_instance_results(
837
- metrics,
838
- result_df,
839
- trace_destination,
840
- target_run,
841
- evaluation_name,
842
- )
843
1702
 
844
- result_df_dict = result_df.to_dict("records")
845
- result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
1703
+ input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
1704
+ eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
1705
+ eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
1706
+ eval_metrics.update(evaluators_metric)
846
1707
 
847
- _print_summary(per_evaluator_results)
1708
+ return eval_result_df, eval_metrics, per_evaluator_results
848
1709
 
849
- if output_path:
850
- _write_output(output_path, result)
851
1710
 
852
- return result
1711
+ def _map_names_to_builtins(
1712
+ evaluators: Dict[str, Callable],
1713
+ graders: Dict[str, AzureOpenAIGrader],
1714
+ ) -> Dict[str, str]:
1715
+ """
1716
+ Construct a mapping from user-supplied evaluator names to which known, built-in
1717
+ evaluator or grader they refer to. Custom evaluators are excluded from the mapping
1718
+ as we only want to track built-in evaluators and graders.
1719
+
1720
+ :param evaluators: The dictionary of evaluators.
1721
+ :type evaluators: Dict[str, Callable]
1722
+ :param graders: The dictionary of graders.
1723
+ :type graders: Dict[str, AzureOpenAIGrader]
1724
+ :return: A mapping from user-supplied evaluator names to built-in evaluator or grader ids.
1725
+ :rtype: Dict[str, str]
1726
+
1727
+ """
1728
+ from .._eval_mapping import EVAL_CLASS_MAP
1729
+
1730
+ name_map = {}
1731
+
1732
+ for name, evaluator in evaluators.items():
1733
+ # Check if the evaluator is a known built-in evaluator
1734
+ found_eval = False
1735
+ for eval_class, eval_id in EVAL_CLASS_MAP.items():
1736
+ if isinstance(evaluator, eval_class):
1737
+ name_map[name] = eval_id
1738
+ found_eval = True
1739
+ break
1740
+ if not found_eval:
1741
+ # Skip custom evaluators - we only want to track built-in evaluators
1742
+ pass
1743
+
1744
+ for name, grader in graders.items():
1745
+ name_map[name] = grader.id
1746
+
1747
+ return name_map
1748
+
1749
+
1750
+ def _turn_error_logs_into_exception(log_path: str) -> None:
1751
+ """Produce an EvaluationException using the contents of the inputted
1752
+ file as the error message.
1753
+
1754
+ :param log_path: The path to the error log file.
1755
+ :type log_path: str
1756
+ """
1757
+ with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
1758
+ error_message = file.read()
1759
+ raise EvaluationException(
1760
+ message=error_message,
1761
+ target=ErrorTarget.EVALUATE,
1762
+ category=ErrorCategory.FAILED_EXECUTION,
1763
+ blame=ErrorBlame.UNKNOWN,
1764
+ )
1765
+
1766
+
1767
+ def _convert_results_to_aoai_evaluation_results(
1768
+ results: EvaluationResult,
1769
+ logger: logging.Logger,
1770
+ eval_id: Optional[str] = None,
1771
+ eval_run_id: Optional[str] = None,
1772
+ evaluators: Optional[Dict[str, Union[Callable, AzureOpenAIGrader]]] = None,
1773
+ eval_run_summary: Optional[Dict[str, Any]] = None,
1774
+ eval_meta_data: Optional[Dict[str, Any]] = None,
1775
+ ) -> None:
1776
+ """
1777
+ Convert evaluation results to AOAI evaluation results format.
1778
+
1779
+ Each row of input results.rows looks like:
1780
+ {"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe",
1781
+ "inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.",
1782
+ "outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5}
1783
+
1784
+ Convert each row into new RunOutputItem object with results array.
1785
+
1786
+ :param results: The evaluation results to convert
1787
+ :type results: EvaluationResult
1788
+ :param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria
1789
+ :type eval_meta_data: Dict[str, Any]
1790
+ :param logger: Logger instance
1791
+ :type logger: logging.Logger
1792
+ :param eval_id: Optional evaluation id recorded on each converted row
+ :type eval_id: Optional[str]
+ :param eval_run_id: Optional evaluation run id recorded on each converted row
+ :type eval_run_id: Optional[str]
+ :param evaluators: The evaluators and graders used for the run, keyed by testing criteria name
+ :type evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
+ :param eval_run_summary: Optional per-criteria run summary used to surface evaluator errors
+ :type eval_run_summary: Optional[Dict[str, Any]]
+ :return: None. The converted rows and summary statistics are written back onto ``results`` in place.
1793
+ :rtype: None
1794
+ """
1795
+
1796
+ if evaluators is None:
1797
+ return
1798
+
1799
+ # Get the testing_criteria_name and testing_criteria_type from evaluators
1800
+ testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {}
1801
+ criteria_name_types_from_meta: Optional[Dict[str, str]] = {}
1802
+ if eval_meta_data and "testing_criteria" in eval_meta_data:
1803
+ testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")
1804
+ if testing_criteria_list is not None:
1805
+ for criteria in testing_criteria_list:
1806
+ criteria_name = criteria.get("name")
1807
+ criteria_type = criteria.get("type")
1808
+ if criteria_name is not None and criteria_type is not None:
1809
+ criteria_name_types_from_meta[criteria_name] = criteria
1810
+
1811
+ for criteria_name, evaluator in evaluators.items():
1812
+ criteria_type = None
1813
+ metrics = []
1814
+ if criteria_name in criteria_name_types_from_meta:
1815
+ criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
1816
+ evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
1817
+ current_evaluator_metrics = criteria_name_types_from_meta[criteria_name].get("metrics", None)
1818
+ if current_evaluator_metrics and len(current_evaluator_metrics) > 0:
1819
+ metrics.extend(current_evaluator_metrics)
1820
+ elif evaluator_name:
1821
+ if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
1822
+ evaluator_name = evaluator_name.replace("builtin.", "")
1823
+ metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
1824
+ if metrics_mapped and len(metrics_mapped) > 0:
1825
+ metrics.extend(metrics_mapped)
1826
+ else:
1827
+ metrics.append(criteria_name)
1828
+ else:
1829
+ metrics.append(criteria_name)
1830
+ elif isinstance(evaluator, AzureOpenAIGrader):
1831
+ criteria_type = evaluator._type # pylint: disable=protected-access
1832
+ metrics.append(criteria_name)
1833
+ elif isinstance(evaluator, EvaluatorBase):
1834
+ criteria_type = "azure_ai_evaluator"
1835
+ evaluator_class_name = evaluator.__class__.__name__
1836
+ eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None)
1837
+ if eval_name:
1838
+ metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
1839
+ if metrics_mapped and len(metrics_mapped) > 0:
1840
+ metrics.extend(metrics_mapped)
1841
+ else:
1842
+ metrics.append(criteria_name)
1843
+ else:
1844
+ criteria_type = "unknown"
1845
+ metrics.append(criteria_name)
1846
+ testing_criteria_name_types_metrics[criteria_name] = {"type": criteria_type, "metrics": metrics}
1847
+
1848
+ created_time = int(time.time())
1849
+ converted_rows = []
1850
+
1851
+ for row_idx, row in enumerate(results.get("rows", [])):
1852
+ # Group outputs by test criteria name
1853
+ criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()}
1854
+ input_groups = {}
1855
+ top_sample = {}
1856
+ for key, value in row.items():
1857
+ if key.startswith("outputs."):
1858
+ # Parse key: outputs.<test-criteria-name>.<metric>
1859
+ parts = key.split(".", 2) # Split into max 3 parts: ['outputs', '<criteria-name>', '<metric>']
1860
+ if len(parts) >= 3:
1861
+ criteria_name = parts[1]
1862
+ metric_name = parts[2]
1863
+
1864
+ if criteria_name not in criteria_groups:
1865
+ criteria_groups[criteria_name] = {}
1866
+
1867
+ criteria_groups[criteria_name][metric_name] = value
1868
+ else:
1869
+ input_key = key.replace("inputs.", "") if key.startswith("inputs.") else key
1870
+ if input_key not in input_groups:
1871
+ input_groups[input_key] = value
1872
+
1873
+ # Convert each criteria group to RunOutputItem result
1874
+ run_output_results = []
1875
+ for criteria_name, metrics in criteria_groups.items():
1876
+ # Extract metrics for this criteria
1877
+ expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
1878
+ criteria_type = testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "unknown")
1879
+ result_per_metric = {}
1880
+ # Find score - look for various score patterns
1881
+ for metric_key, metric_value in metrics.items():
1882
+ if metric_key.endswith("_score") or metric_key == "score":
1883
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1884
+ if metric not in result_per_metric:
1885
+ result_per_metric[metric] = {"score": metric_value}
1886
+ else:
1887
+ result_per_metric[metric]["score"] = metric_value
1888
+ _append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
1889
+ if metric_key == "passed":
1890
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1891
+ if metric not in result_per_metric:
1892
+ result_per_metric[metric] = {"passed": metric_value}
1893
+ else:
1894
+ result_per_metric[metric]["passed"] = metric_value
1895
+ _append_indirect_attachments_to_results(result_per_metric, "passed", metric, metric_value)
1896
+ elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
1897
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1898
+ label = metric_value
1899
+ passed = (
1900
+ True if (str(metric_value).lower() == "pass" or str(metric_value).lower() == "true") else False
1901
+ )
1902
+ if metric not in result_per_metric:
1903
+ if criteria_type == "azure_ai_evaluator":
1904
+ result_per_metric[metric] = {"label": label, "passed": passed}
1905
+ else:
1906
+ result_per_metric[metric] = {"label": label}
1907
+ else:
1908
+ result_per_metric[metric]["label"] = metric_value
1909
+ if criteria_type == "azure_ai_evaluator":
1910
+ result_per_metric[metric]["passed"] = passed
1911
+ _append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
1912
+ if criteria_type == "azure_ai_evaluator":
1913
+ _append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
1914
+ elif (
1915
+ metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
1916
+ ) or metric_key == "reason":
1917
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1918
+ if metric not in result_per_metric:
1919
+ result_per_metric[metric] = {"reason": metric_value}
1920
+ else:
1921
+ result_per_metric[metric]["reason"] = metric_value
1922
+ _append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
1923
+ elif metric_key.endswith("_threshold") or metric_key == "threshold":
1924
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1925
+ if metric not in result_per_metric:
1926
+ result_per_metric[metric] = {"threshold": metric_value}
1927
+ else:
1928
+ result_per_metric[metric]["threshold"] = metric_value
1929
+ _append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
1930
+ elif metric_key == "sample":
1931
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1932
+ if metric not in result_per_metric:
1933
+ result_per_metric[metric] = {"sample": metric_value}
1934
+ else:
1935
+ result_per_metric[metric]["sample"] = metric_value
1936
+ _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
1937
+ elif metric_key.endswith("_finish_reason"):
1938
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1939
+ if metric not in result_per_metric:
1940
+ result_per_metric[metric] = {"sample": {"finish_reason": metric_value}}
1941
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1942
+ result_per_metric[metric]["sample"] = {"finish_reason": metric_value}
1943
+ elif (
1944
+ metric in result_per_metric
1945
+ and "sample" in result_per_metric[metric]
1946
+ and "finish_reason" not in result_per_metric[metric]["sample"]
1947
+ ):
1948
+ result_per_metric[metric]["sample"]["finish_reason"] = metric_value
1949
+ _append_indirect_attachments_to_results(
1950
+ result_per_metric, "sample", metric, metric_value, "finish_reason"
1951
+ )
1952
+ elif metric_key.endswith("_model"):
1953
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1954
+ if metric not in result_per_metric:
1955
+ result_per_metric[metric] = {"sample": {"model": metric_value}}
1956
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1957
+ result_per_metric[metric]["sample"] = {"model": metric_value}
1958
+ elif (
1959
+ metric in result_per_metric
1960
+ and "sample" in result_per_metric[metric]
1961
+ and "model" not in result_per_metric[metric]["sample"]
1962
+ ):
1963
+ result_per_metric[metric]["sample"]["model"] = metric_value
1964
+ _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
1965
+ elif metric_key.endswith("_sample_input"):
1966
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1967
+ input_metric_val_json: Optional[List[Dict[str, Any]]] = []
1968
+ try:
1969
+ input_metric_val_json = json.loads(metric_value)
1970
+ except Exception as e:
1971
+ logger.warning(f"Failed to parse _sample_input value as JSON: {e}")
1972
+ if metric not in result_per_metric:
1973
+ result_per_metric[metric] = {"sample": {"input": input_metric_val_json}}
1974
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1975
+ result_per_metric[metric]["sample"] = {"input": input_metric_val_json}
1976
+ elif (
1977
+ metric in result_per_metric
1978
+ and "sample" in result_per_metric[metric]
1979
+ and "input" not in result_per_metric[metric]["sample"]
1980
+ ):
1981
+ result_per_metric[metric]["sample"]["input"] = input_metric_val_json
1982
+ _append_indirect_attachments_to_results(
1983
+ result_per_metric, "sample", metric, input_metric_val_json, "input"
1984
+ )
1985
+ elif metric_key.endswith("_sample_output"):
1986
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1987
+ output_metric_val_json: Optional[List[Dict[str, Any]]] = []
1988
+ try:
1989
+ output_metric_val_json = json.loads(metric_value)
1990
+ except Exception as e:
1991
+ logger.warning(f"Failed to parse _sample_output value as JSON: {e}")
1992
+ if metric not in result_per_metric:
1993
+ result_per_metric[metric] = {"sample": {"output": output_metric_val_json}}
1994
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1995
+ result_per_metric[metric]["sample"] = {"output": output_metric_val_json}
1996
+ elif (
1997
+ metric in result_per_metric
1998
+ and "sample" in result_per_metric[metric]
1999
+ and "output" not in result_per_metric[metric]["sample"]
2000
+ ):
2001
+ result_per_metric[metric]["sample"]["output"] = output_metric_val_json
2002
+ _append_indirect_attachments_to_results(
2003
+ result_per_metric, "sample", metric, output_metric_val_json, "output"
2004
+ )
2005
+ elif metric_key.endswith("_total_tokens"):
2006
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2007
+ metric_value = None if _is_none_or_nan(metric_value) else metric_value
2008
+ if metric not in result_per_metric:
2009
+ result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
2010
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2011
+ result_per_metric[metric]["sample"] = {"usage": {"total_tokens": metric_value}}
2012
+ elif (
2013
+ metric in result_per_metric
2014
+ and "sample" in result_per_metric[metric]
2015
+ and "usage" not in result_per_metric[metric]["sample"]
2016
+ ):
2017
+ result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
2018
+ else:
2019
+ result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
2020
+ _append_indirect_attachments_to_results(
2021
+ result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
2022
+ )
2023
+ elif metric_key.endswith("_prompt_tokens"):
2024
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2025
+ metric_value = None if _is_none_or_nan(metric_value) else metric_value
2026
+ if metric not in result_per_metric:
2027
+ result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
2028
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2029
+ result_per_metric[metric]["sample"] = {"usage": {"prompt_tokens": metric_value}}
2030
+ elif (
2031
+ metric in result_per_metric
2032
+ and "sample" in result_per_metric[metric]
2033
+ and "usage" not in result_per_metric[metric]["sample"]
2034
+ ):
2035
+ result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
2036
+ else:
2037
+ result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
2038
+ _append_indirect_attachments_to_results(
2039
+ result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
2040
+ )
2041
+ elif metric_key.endswith("_completion_tokens"):
2042
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2043
+ metric_value = None if _is_none_or_nan(metric_value) else metric_value
2044
+ if metric not in result_per_metric:
2045
+ result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
2046
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2047
+ result_per_metric[metric]["sample"] = {"usage": {"completion_tokens": metric_value}}
2048
+ elif (
2049
+ metric in result_per_metric
2050
+ and "sample" in result_per_metric[metric]
2051
+ and "usage" not in result_per_metric[metric]["sample"]
2052
+ ):
2053
+ result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
2054
+ else:
2055
+ result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
2056
+ _append_indirect_attachments_to_results(
2057
+ result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
2058
+ )
2059
+ elif not any(
2060
+ metric_key.endswith(suffix)
2061
+ for suffix in [
2062
+ "_result",
2063
+ "_reason",
2064
+ "_threshold",
2065
+ "_label",
2066
+ "_score",
2067
+ "_model",
2068
+ "_finish_reason",
2069
+ "_sample_input",
2070
+ "_sample_output",
2071
+ "_total_tokens",
2072
+ "_prompt_tokens",
2073
+ "_completion_tokens",
2074
+ ]
2075
+ ):
2076
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2077
+ # If no score found yet and this doesn't match other patterns, use as score
2078
+ if metric_key == metric and metric not in result_per_metric:
2079
+ result_per_metric[metric] = {"score": metric_value}
2080
+ elif metric_key == metric and result_per_metric[metric].get("score", None) is None:
2081
+ result_per_metric[metric]["score"] = metric_value
2082
+
2083
+ for metric, metric_values in result_per_metric.items():
2084
+ score = metric_values.get("score", None)
2085
+ label = metric_values.get("label", None)
2086
+ reason = metric_values.get("reason", None)
2087
+ threshold = metric_values.get("threshold", None)
2088
+ passed = metric_values.get("passed", None)
2089
+ sample = metric_values.get("sample", None)
2090
+
2091
+ # Create result object for this criteria
2092
+ result_obj = {
2093
+ "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
2094
+ "type", "azure_ai_evaluator"
2095
+ ),
2096
+ "name": criteria_name, # Use criteria name as name
2097
+ "metric": metric if metric is not None else criteria_name, # Use criteria name as metric
2098
+ }
2099
+ # Add optional fields
2100
+ if (
2101
+ metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
2102
+ or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
2103
+ or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
2104
+ ):
2105
+ copy_label = label
2106
+ if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
2107
+ label = "fail"
2108
+ score = 0.0
2109
+ passed = False
2110
+ else:
2111
+ label = "pass"
2112
+ score = 1.0
2113
+ passed = True
2114
+ result_obj["score"] = (
2115
+ score if not (score is None or (isinstance(score, float) and math.isnan(score))) else None
2116
+ )
2117
+ result_obj["label"] = label
2118
+ result_obj["reason"] = reason
2119
+ result_obj["threshold"] = threshold
2120
+ result_obj["passed"] = passed
2121
+
2122
+ if sample is not None:
2123
+ result_obj["sample"] = sample
2124
+ top_sample = sample # Save top sample for the row
2125
+ run_output_results.append(result_obj)
2126
+
2127
+ if (
2128
+ eval_run_summary
2129
+ and criteria_name in eval_run_summary
2130
+ and isinstance(eval_run_summary[criteria_name], dict)
2131
+ and "error_code" in eval_run_summary[criteria_name]
2132
+ ) and eval_run_summary[criteria_name].get("error_code", None) is not None:
2133
+ error_info = (
2134
+ {
2135
+ "code": eval_run_summary[criteria_name].get("error_code", None),
2136
+ "message": eval_run_summary[criteria_name].get("error_message", None),
2137
+ }
2138
+ if eval_run_summary[criteria_name].get("error_code", None) is not None
2139
+ else None
2140
+ )
2141
+ sample = {"error": error_info} if error_info is not None else None
2142
+ # Create result object for this criteria
2143
+ metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
2144
+ for metric in metrics:
2145
+ should_add_error_summary = True
2146
+ for result in run_output_results:
2147
+ if result.get("name", None) == criteria_name and result.get("metric", None) == metric:
2148
+ rs_score = result.get("score", None)
2149
+ rs_threshold = result.get("threshold", None)
2150
+ rs_label = result.get("label", None)
2151
+ rs_reason = result.get("reason", None)
2152
+ if (
2153
+ _is_none_or_nan(rs_score)
2154
+ and _is_none_or_nan(rs_threshold)
2155
+ and _is_none_or_nan(rs_label)
2156
+ and _is_none_or_nan(rs_reason)
2157
+ ):
2158
+ run_output_results.remove(result)
2159
+ else:
2160
+ should_add_error_summary = False
2161
+ break # Skip if already have result for this criteria and metric
2162
+ if should_add_error_summary:
2163
+ result_obj = {
2164
+ "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
2165
+ "type", "azure_ai_evaluator"
2166
+ ),
2167
+ "name": criteria_name, # Use criteria name as name
2168
+ "metric": metric if metric is not None else criteria_name, # Use criteria name as metric
2169
+ "score": None,
2170
+ "label": None,
2171
+ "reason": None,
2172
+ "threshold": None,
2173
+ "passed": None,
2174
+ "sample": sample,
2175
+ }
2176
+ run_output_results.append(result_obj)
2177
+
2178
+ # Create RunOutputItem structure
2179
+ run_output_item = {
2180
+ "object": "eval.run.output_item",
2181
+ "id": f"{row_idx+1}",
2182
+ "run_id": eval_run_id,
2183
+ "eval_id": eval_id,
2184
+ "created_at": created_time,
2185
+ "datasource_item_id": row_idx,
2186
+ "datasource_item": input_groups,
2187
+ "results": run_output_results,
2188
+ "status": "completed" if len(run_output_results) > 0 else "error",
2189
+ }
2190
+
2191
+ run_output_item["sample"] = top_sample
2192
+
2193
+ converted_rows.append(run_output_item)
2194
+
2195
+ # Create converted results maintaining the same structure
2196
+ results["_evaluation_results_list"] = converted_rows
2197
+ logger.info(
2198
+ f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
2199
+ )
2200
+ # Calculate summary statistics
2201
+ evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger, criteria_name_types_from_meta)
2202
+ results["_evaluation_summary"] = evaluation_summary
2203
+ logger.info(
2204
+ f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
2205
+ )
2206
+
2207
+
2208
+ def _is_none_or_nan(value: Any) -> bool:
2209
+ """
2210
+ Check if a value is None or NaN.
2211
+
2212
+ :param value: The value to check
2213
+ :type value: Any
2214
+ :return: True if the value is None or NaN, False otherwise
2215
+ :rtype: bool
2216
+ """
2217
+ if value is None:
2218
+ return True
2219
+ if isinstance(value, float) and math.isnan(value):
2220
+ return True
2221
+ if isinstance(value, str) and value.lower() in ["nan", "null", "none", ""]:
2222
+ return True
2223
+ return False
2224
+
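A quick sketch of which values the helper above treats as missing:

    import math

    # Mirrors the checks above: None, float NaN, and a few string spellings count as missing.
    samples = [None, float("nan"), "NaN", "null", "", 0, "0", 3.5]
    missing = [
        v is None
        or (isinstance(v, float) and math.isnan(v))
        or (isinstance(v, str) and v.lower() in ["nan", "null", "none", ""])
        for v in samples
    ]
    print(missing)  # [True, True, True, True, True, False, False, False]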
2225
+
2226
+ def _append_indirect_attachments_to_results(
2227
+ current_result_dict: Dict[str, Any],
2228
+ result_name: str,
2229
+ metric: str,
2230
+ metric_value: Any,
2231
+ nested_result_name: Optional[str] = None,
2232
+ secondnested_result_name: Optional[str] = None,
2233
+ ) -> None:
2234
+ """
2235
+ Append indirect attachments to the current result dictionary.
2236
+
2237
+ :param current_result_dict: The current result dictionary to update
2238
+ :type current_result_dict: Dict[str, Any]
2239
+ :param result_name: The result name
2240
+ :type result_name: str
2241
+ :param metric: The metric name
2242
+ :type metric: str
2243
+ :param metric_value: The value of the metric
2244
+ :type metric_value: Any
+ :param nested_result_name: Optional nested key under the result name (for example "finish_reason" under "sample")
+ :type nested_result_name: Optional[str]
+ :param secondnested_result_name: Optional second-level nested key (for example "total_tokens" under "usage")
+ :type secondnested_result_name: Optional[str]
2245
+ """
2246
+ if metric == "xpia" and result_name:
2247
+ for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
2248
+ if nested_result_name is None:
2249
+ if metric_extended not in current_result_dict:
2250
+ current_result_dict[metric_extended] = {result_name: metric_value}
2251
+ else:
2252
+ current_result_dict[metric_extended][result_name] = metric_value
2253
+ elif nested_result_name is not None and secondnested_result_name is None:
2254
+ if metric_extended not in current_result_dict:
2255
+ current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
2256
+ elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
2257
+ current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
2258
+ elif (
2259
+ metric_extended in current_result_dict
2260
+ and result_name in current_result_dict[metric_extended]
2261
+ and nested_result_name not in current_result_dict[metric_extended][result_name]
2262
+ ):
2263
+ current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
2264
+ elif nested_result_name is not None and secondnested_result_name is not None:
2265
+ if metric_extended not in current_result_dict:
2266
+ current_result_dict[metric_extended] = {
2267
+ result_name: {nested_result_name: {secondnested_result_name: metric_value}}
2268
+ }
2269
+ elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
2270
+ current_result_dict[metric_extended][result_name] = {
2271
+ nested_result_name: {secondnested_result_name: metric_value}
2272
+ }
2273
+ elif (
2274
+ metric_extended in current_result_dict
2275
+ and result_name in current_result_dict[metric_extended]
2276
+ and nested_result_name not in current_result_dict[metric_extended][result_name]
2277
+ ):
2278
+ current_result_dict[metric_extended][result_name][nested_result_name] = {
2279
+ secondnested_result_name: metric_value
2280
+ }
2281
+ else:
2282
+ (
2283
+ current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
2284
+ ) = metric_value
2285
+
2286
+
2287
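A rough sketch (not part of the diff) of the xpia fan-out, assuming an empty result dictionary; the key name "indirect_attack_score" is invented for illustration:

    current: dict = {}
    _append_indirect_attachments_to_results(current, "indirect_attack_score", "xpia", 0.25)
    # current now holds the same value under each extended xpia key:
    # {
    #     "xpia_manipulated_content": {"indirect_attack_score": 0.25},
    #     "xpia_intrusion": {"indirect_attack_score": 0.25},
    #     "xpia_information_gathering": {"indirect_attack_score": 0.25},
    # }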
+ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
+     """
+     Get the metric name from the testing criteria and metric key.
+
+     :param testing_criteria_name: The name of the testing criteria
+     :type testing_criteria_name: str
+     :param metric_key: The metric key to look for
+     :type metric_key: str
+     :param metric_list: List of expected metrics for the testing criteria
+     :type metric_list: List[str]
+     :return: The metric name if found, otherwise the testing criteria name
+     :rtype: str
+     """
+     metric = None
+
+     if metric_key == "xpia_manipulated_content":
+         metric = "xpia_manipulated_content"
+         return metric
+     elif metric_key == "xpia_intrusion":
+         metric = "xpia_intrusion"
+         return metric
+     elif metric_key == "xpia_information_gathering":
+         metric = "xpia_information_gathering"
+         return metric
+     for expected_metric in metric_list:
+         if metric_key.startswith(expected_metric):
+             metric = expected_metric
+             break
+     if metric is None:
+         metric = testing_criteria_name
+     return metric
+
+
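Illustrative inputs and outputs (not part of the diff; the argument values are invented): the xpia_* keys short-circuit, and everything else is prefix-matched against the expected metric list, falling back to the criteria name:

    _get_metric_from_criteria("violence_grader", "violence_score", ["violence"])   # -> "violence"
    _get_metric_from_criteria("violence_grader", "custom_key", ["violence"])       # -> "violence_grader" (fallback)
    _get_metric_from_criteria("violence_grader", "xpia_intrusion", ["violence"])   # -> "xpia_intrusion" (short-circuit)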
+ def _is_primary_metric(metric_name: str, evaluator_name: str) -> bool:
+     """
+     Check if the given metric name is a primary metric.
+
+     :param metric_name: The name of the metric
+     :type metric_name: str
+     :param evaluator_name: The name of the evaluator
+     :type evaluator_name: str
+     :return: True if the metric is a primary metric, False otherwise
+     :rtype: bool
+     """
+     if (
+         not _is_none_or_nan(metric_name)
+         and not _is_none_or_nan(evaluator_name)
+         and evaluator_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS
+         and isinstance(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name], list)
+         and len(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]) > 1
+         and metric_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]
+         and metric_name.lower() != _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name][0].lower()
+     ):
+         return False
+     else:
+         return True
+
+
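A sketch of the intended behavior (not part of the diff); the mapping contents referenced in the comments are assumed for illustration and not verified against the package:

    # Assuming EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"] lists "xpia" first,
    # followed by the three xpia_* sub-metrics:
    _is_primary_metric("xpia", "indirect_attack")             # True  (first metric in the mapping)
    _is_primary_metric("xpia_intrusion", "indirect_attack")   # False (secondary metric, skipped in counts)
    _is_primary_metric("anything", "not_a_mapped_evaluator")  # True  (falls through to the default)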
+ def _calculate_aoai_evaluation_summary(
+     aoai_results: list, logger: logging.Logger, criteria_name_types_from_meta: Optional[Dict[str, Any]]
+ ) -> Dict[str, Any]:
+     """
+     Calculate summary statistics for AOAI evaluation results.
+
+     :param aoai_results: List of AOAI result objects (run_output_items)
+     :type aoai_results: list
+     :param logger: Logger used to emit progress and diagnostic messages
+     :type logger: logging.Logger
+     :param criteria_name_types_from_meta: Optional mapping from testing criteria name to its criteria type and
+         evaluator name, used to skip non-primary metrics when counting
+     :type criteria_name_types_from_meta: Optional[Dict[str, Any]]
+     :return: Summary statistics dictionary
+     :rtype: Dict[str, Any]
+     """
+     # Calculate result counts based on aoaiResults
+     result_counts = {"total": 0, "errored": 0, "failed": 0, "passed": 0}
+
+     # Count results by status and calculate per model usage
+     model_usage_stats = {}  # Dictionary to aggregate usage by model
+     result_counts_stats = {}  # Dictionary to aggregate pass/fail counts by testing criteria
+
+     for aoai_result in aoai_results:
+         logger.info(
+             f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}"
+         )
+         result_counts["total"] += 1
+         passed_count = 0
+         failed_count = 0
+         error_count = 0
+         if isinstance(aoai_result, dict) and "results" in aoai_result:
+             logger.info(
+                 f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}"
+             )
+             for result_item in aoai_result["results"]:
+                 if isinstance(result_item, dict):
+                     testing_criteria = result_item.get("name", "")
+                     is_primary_metric = True
+                     if (
+                         criteria_name_types_from_meta is not None
+                         and isinstance(criteria_name_types_from_meta, dict)
+                         and testing_criteria in criteria_name_types_from_meta
+                     ):
+                         evaluator_name = criteria_name_types_from_meta[testing_criteria].get("evaluator_name", None)
+                         criteria_type = criteria_name_types_from_meta[testing_criteria].get("type", None)
+                         if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+                             evaluator_name = evaluator_name.replace("builtin.", "")
+                         is_primary_metric = _is_primary_metric(result_item.get("metric", ""), evaluator_name)
+                     if not is_primary_metric:
+                         logger.info(
+                             f"Skip counts for non-primary metric for testing_criteria: {testing_criteria}, metric: {result_item.get('metric', '')}"
+                         )
+                         continue
+                     # Check if the result has a 'passed' field
+                     if "passed" in result_item and result_item["passed"] is not None:
+                         if testing_criteria not in result_counts_stats:
+                             result_counts_stats[testing_criteria] = {
+                                 "testing_criteria": testing_criteria,
+                                 "failed": 0,
+                                 "passed": 0,
+                             }
+                         if result_item["passed"] is True:
+                             passed_count += 1
+                             result_counts_stats[testing_criteria]["passed"] += 1
+
+                         elif result_item["passed"] is False:
+                             failed_count += 1
+                             result_counts_stats[testing_criteria]["failed"] += 1
+                     # Check if the result indicates an error status
+                     elif ("status" in result_item and result_item["status"] in ["error", "errored"]) or (
+                         "sample" in result_item
+                         and isinstance(result_item["sample"], dict)
+                         and result_item["sample"].get("error", None) is not None
+                     ):
+                         error_count += 1
+         elif hasattr(aoai_result, "status") and aoai_result.status == "error":
+             error_count += 1
+         elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
+             error_count += 1
+
+         # Update overall result counts, error counts will not be considered for passed/failed
+         if error_count > 0:
+             result_counts["errored"] += 1
+
+         if failed_count > 0:
+             result_counts["failed"] += 1
+         elif (
+             failed_count == 0 and passed_count > 0 and passed_count == len(aoai_result.get("results", [])) - error_count
+         ):
+             result_counts["passed"] += 1
+
+         # Extract usage statistics from aoai_result.sample
+         sample_data_list = []
+         dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
+         dup_usage_list.remove("xpia")
+         if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
+             for result_item in aoai_result["results"]:
+                 if (
+                     isinstance(result_item, dict)
+                     and "sample" in result_item
+                     and result_item["sample"]
+                     and result_item["metric"] not in dup_usage_list
+                 ):
+                     sample_data_list.append(result_item["sample"])
+
+         for sample_data in sample_data_list:
+             if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
+                 usage_data = sample_data["usage"]
+                 model_name = sample_data.get("model", "unknown") if usage_data.get("model", "unknown") else "unknown"
+                 if _is_none_or_nan(model_name):
+                     continue
+                 if model_name not in model_usage_stats:
+                     model_usage_stats[model_name] = {
+                         "invocation_count": 0,
+                         "total_tokens": 0,
+                         "prompt_tokens": 0,
+                         "completion_tokens": 0,
+                         "cached_tokens": 0,
+                     }
+                 # Aggregate usage statistics
+                 model_stats = model_usage_stats[model_name]
+                 model_stats["invocation_count"] += 1
+                 if isinstance(usage_data, dict):
+                     cur_total_tokens = usage_data.get("total_tokens", 0)
+                     if _is_none_or_nan(cur_total_tokens):
+                         cur_total_tokens = 0
+                     cur_prompt_tokens = usage_data.get("prompt_tokens", 0)
+                     if _is_none_or_nan(cur_prompt_tokens):
+                         cur_prompt_tokens = 0
+                     cur_completion_tokens = usage_data.get("completion_tokens", 0)
+                     if _is_none_or_nan(cur_completion_tokens):
+                         cur_completion_tokens = 0
+                     cur_cached_tokens = usage_data.get("cached_tokens", 0)
+                     if _is_none_or_nan(cur_cached_tokens):
+                         cur_cached_tokens = 0
+                     logger.info(
+                         f"Model: {model_name}, cur_total_tokens: {cur_total_tokens}, {_is_none_or_nan(cur_total_tokens)}, cur_prompt_tokens: {cur_prompt_tokens}, cur_completion_tokens: {cur_completion_tokens}, cur_cached_tokens: {cur_cached_tokens}"
+                     )
+                     model_stats["total_tokens"] += cur_total_tokens
+                     model_stats["prompt_tokens"] += cur_prompt_tokens
+                     model_stats["completion_tokens"] += cur_completion_tokens
+                     model_stats["cached_tokens"] += cur_cached_tokens
+
+     # Convert model usage stats to list format matching EvaluationRunPerModelUsage
+     per_model_usage = []
+     for model_name, stats in model_usage_stats.items():
+         per_model_usage.append(
+             {
+                 "model_name": model_name,
+                 "invocation_count": stats["invocation_count"],
+                 "total_tokens": stats["total_tokens"],
+                 "prompt_tokens": stats["prompt_tokens"],
+                 "completion_tokens": stats["completion_tokens"],
+                 "cached_tokens": stats["cached_tokens"],
+             }
+         )
+     result_counts_stats_val = []
+     logger.info(f"\r\n Result counts stats: {result_counts_stats}")
+     for criteria_name, stats_val in result_counts_stats.items():
+         if isinstance(stats_val, dict):
+             logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}")
+             cur_passed = stats_val.get("passed", 0)
+             if _is_none_or_nan(cur_passed):
+                 cur_passed = 0
+             cur_failed_count = stats_val.get("failed", 0)
+             if _is_none_or_nan(cur_failed_count):
+                 cur_failed_count = 0
+             result_counts_stats_val.append(
+                 {
+                     "testing_criteria": criteria_name if not _is_none_or_nan(criteria_name) else "unknown",
+                     "passed": cur_passed,
+                     "failed": cur_failed_count,
+                 }
+             )
+     return {
+         "result_counts": result_counts,
+         "per_model_usage": per_model_usage,
+         "per_testing_criteria_results": result_counts_stats_val,
+     }
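For orientation (not part of the diff), the returned summary has roughly the shape below; all values are invented for illustration:

    {
        "result_counts": {"total": 10, "errored": 1, "failed": 2, "passed": 7},
        "per_model_usage": [
            {
                "model_name": "gpt-4o",
                "invocation_count": 10,
                "total_tokens": 12345,
                "prompt_tokens": 9000,
                "completion_tokens": 3345,
                "cached_tokens": 0,
            }
        ],
        "per_testing_criteria_results": [
            {"testing_criteria": "violence", "passed": 8, "failed": 2},
        ],
    }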