azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -2,37 +2,128 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  import inspect
+ import contextlib
+ import json
+ import logging
+ import math
  import os
  import re
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
-
- import numpy as np
+ import tempfile
+ import json
+ import time
+ from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Set, Tuple, TypedDict, Union, cast
+
+ from openai import OpenAI, AzureOpenAI
+ from azure.ai.evaluation._legacy._adapters._constants import LINE_NUMBER
+ from azure.ai.evaluation._legacy._adapters.entities import Run
  import pandas as pd
- from promptflow._sdk._constants import LINE_NUMBER
- from promptflow.client import PFClient

+ from azure.ai.evaluation._common.math import list_mean_nan_safe, apply_transform_nan_safe
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
+ from azure.ai.evaluation._evaluators._common._base_eval import EvaluatorBase
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader

  from .._constants import (
  CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
+ EVALUATION_PASS_FAIL_MAPPING,
  EvaluationMetrics,
+ DefaultOpenEncoding,
  Prefixes,
  _InternalEvaluationMetrics,
+ BINARY_AGGREGATE_SUFFIX,
+ DEFAULT_OAI_EVAL_RUN_NAME,
+ EVALUATION_EVENT_NAME,
+ _EvaluatorMetricMapping,
+ )
+ from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig, AppInsightsConfig
+ from .._user_agent import UserAgentSingleton
+ from ._batch_run import (
+ EvalRunContext,
+ CodeClient,
+ ProxyClient,
+ TargetRunContext,
+ RunSubmitterClient,
  )
- from .._model_configurations import AzureAIProject
- from .._user_agent import USER_AGENT
- from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient
  from ._utils import (
  _apply_column_mapping,
  _log_metrics_and_instance_results,
  _trace_destination_from_project_scope,
  _write_output,
+ DataLoaderFactory,
+ _log_metrics_and_instance_results_onedp,
+ )
+ from ._batch_run.batch_clients import BatchClient, BatchClientRun
+
+ from ._evaluate_aoai import (
+ _begin_aoai_evaluation,
+ _split_evaluators_and_grader_configs,
+ _get_evaluation_run_results,
+ OAIEvalRunCreationInfo,
  )

+ LOGGER = logging.getLogger(__name__)
+
+ # For metrics (aggregates) whose metric names intentionally differ from their
+ # originating column name, usually because the aggregation of the original value
+ # means something sufficiently different.
+ # Note that content safety metrics are handled separately.
+ METRIC_COLUMN_NAME_REPLACEMENTS = {
+ "groundedness_pro_label": "groundedness_pro_passing_rate",
+ }
+
+
+ class __EvaluatorInfo(TypedDict):
+ result: pd.DataFrame
+ metrics: Dict[str, Any]
+ run_summary: Dict[str, Any]
+
+
+ class __ValidatedData(TypedDict):
+ """
+ Simple dictionary that contains ALL pre-processed data and
+ the resultant objects that are needed for downstream evaluation.
+ """
+
+ evaluators: Dict[str, Callable]
+ graders: Dict[str, AzureOpenAIGrader]
+ input_data_df: pd.DataFrame
+ column_mapping: Dict[str, Dict[str, str]]
+ target_run: Optional[BatchClientRun]
+ batch_run_client: BatchClient
+ batch_run_data: Union[str, os.PathLike, pd.DataFrame]
+
+
+ def _aggregate_other_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[str, float]]:
+ """Identify and average various metrics that need to have the metric name be replaced,
+ instead of having the metric match the originating column name.
+ :param df: The dataframe of evaluation results.
+ :type df: ~pandas.DataFrame
+ :return: A tuple; the first element is a list of dataframe columns that were aggregated,
+ and the second element is a dictionary of resultant new metric column names and their values.
+ :rtype: Tuple[List[str], Dict[str, float]]
+ """
+ renamed_cols = []
+ metric_columns = {}
+ for col in df.columns:
+ metric_prefix = col.split(".")[0]
+ metric_name = col.split(".")[1]
+ if metric_name in METRIC_COLUMN_NAME_REPLACEMENTS:
+ renamed_cols.append(col)
+ new_col_name = metric_prefix + "." + METRIC_COLUMN_NAME_REPLACEMENTS[metric_name]
+ col_with_numeric_values = cast(List[float], pd.to_numeric(df[col], errors="coerce"))
+ try:
+ metric_columns[new_col_name] = round(list_mean_nan_safe(col_with_numeric_values), 2)
+ except EvaluationException:  # only exception that can be cause is all NaN values
+ msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+ LOGGER.warning(msg)
+
+ return renamed_cols, metric_columns
+

  # pylint: disable=line-too-long
  def _aggregate_content_safety_metrics(
- df: pd.DataFrame, evaluators: Dict[str, Type]
+ df: pd.DataFrame, evaluators: Dict[str, Callable]
  ) -> Tuple[List[str], Dict[str, float]]:
  """Find and aggregate defect rates for content safety metrics. Returns both a list
  of columns that were used to calculate defect rates and the defect rates themselves.
@@ -61,7 +152,6 @@ def _aggregate_content_safety_metrics(
  module = inspect.getmodule(evaluators[evaluator_name])
  if (
  module
- and module.__name__.startswith("azure.ai.evaluation.")
  and metric_name.endswith("_score")
  and metric_name.replace("_score", "") in content_safety_metrics
  ):
@@ -71,12 +161,16 @@
  defect_rates = {}
  for col in content_safety_df.columns:
  defect_rate_name = col.replace("_score", "_defect_rate")
- col_with_numeric_values = pd.to_numeric(content_safety_df[col], errors="coerce")
- defect_rates[defect_rate_name] = round(
- np.sum(col_with_numeric_values >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT)
- / col_with_numeric_values.count(),
- 2,
- )
+ col_with_numeric_values = cast(List[float], pd.to_numeric(content_safety_df[col], errors="coerce"))
+ try:
+ col_with_boolean_values = apply_transform_nan_safe(
+ col_with_numeric_values, lambda x: 1 if x >= CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT else 0
+ )
+ defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+ except EvaluationException:  # only exception that can be cause is all NaN values
+ msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+ LOGGER.warning(msg)
+
  return content_safety_cols, defect_rates


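The rewritten loop above replaces the numpy ratio with NaN-safe helpers: each severity score is mapped to 1 when it meets the default threshold and 0 otherwise, and the defect rate is the mean of that indicator over the rows that produced a score. A minimal sketch of the same arithmetic in plain pandas follows; the threshold value of 4 is an assumption for illustration, since CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, apply_transform_nan_safe, and list_mean_nan_safe are not defined in this diff.

import pandas as pd

# Assumed severity cutoff; the real value comes from
# CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, which this diff does not show.
THRESHOLD = 4

scores = pd.Series([0, 2, 5, 7, None])            # one evaluator's "*_score" column
numeric = pd.to_numeric(scores, errors="coerce")  # non-numeric entries become NaN

# 1 where the score meets the threshold, 0 otherwise; unscored (NaN) rows are dropped
indicator = numeric.dropna().apply(lambda x: 1 if x >= THRESHOLD else 0)

# Defect rate = mean of the indicator over the scored rows, rounded like the SDK code
defect_rate = round(indicator.mean(), 2)
print(defect_rate)  # 0.5 for this toy column (2 of the 4 scored rows are >= 4)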
@@ -92,28 +186,152 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
  """
  handled_metrics = [
  EvaluationMetrics.PROTECTED_MATERIAL,
+ EvaluationMetrics.FICTIONAL_CHARACTERS,
+ EvaluationMetrics.ARTWORK,
+ EvaluationMetrics.LOGOS_AND_BRANDS,
  _InternalEvaluationMetrics.ECI,
  EvaluationMetrics.XPIA,
+ EvaluationMetrics.CODE_VULNERABILITY,
+ EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
  ]
  label_cols = []
+ details_cols = []
  for col in df.columns:
  metric_name = col.split(".")[1]
  if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
  label_cols.append(col)
+ if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
+ details_cols = col

  label_df = df[label_cols]
  defect_rates = {}
  for col in label_df.columns:
  defect_rate_name = col.replace("_label", "_defect_rate")
- col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
- defect_rates[defect_rate_name] = round(
- np.sum(col_with_boolean_values) / col_with_boolean_values.count(),
- 2,
- )
+ col_with_boolean_values = cast(List[float], pd.to_numeric(label_df[col], errors="coerce"))
+ try:
+ defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
+ except EvaluationException:  # only exception that can be cause is all NaN values
+ msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
+ LOGGER.warning(msg)
+
+ if details_cols:
+ details_df = df[details_cols]
+ detail_defect_rates = {}
+
+ for key, value in details_df.items():
+ _process_rows(value, detail_defect_rates)
+
+ for key, value in detail_defect_rates.items():
+ col_with_boolean_values = pd.to_numeric(value, errors="coerce")
+ try:
+ defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
+ list_mean_nan_safe(col_with_boolean_values), 2
+ )
+ except EvaluationException:  # only exception that can be cause is all NaN values
+ msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
+ LOGGER.warning(msg)
+
  return label_cols, defect_rates


- def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[str, float]:
+ def _process_rows(row, detail_defect_rates):
+ for key, value in row.items():
+ if key not in detail_defect_rates:
+ detail_defect_rates[key] = []
+ detail_defect_rates[key].append(value)
+ return detail_defect_rates
+
+
+ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
+ """
+ Aggregate binary output results (pass/fail) from evaluation dataframe.
+
+ For each evaluator, calculates the proportion of "pass" results.
+
+ :param df: The dataframe of evaluation results.
+ :type df: ~pandas.DataFrame
+ :return: A dictionary mapping evaluator names to the proportion of pass results.
+ :rtype: Dict[str, float]
+ """
+ results = {}
+
+ # Find all columns that end with "_result"
+ result_columns = [col for col in df.columns if col.startswith("outputs.") and col.endswith("_result")]
+
+ for col in result_columns:
+ # Extract the evaluator name from the column name
+ # (outputs.<evaluator>.<metric>_result)
+ parts = col.split(".")
+ evaluator_name = None
+ if len(parts) >= 3:
+ evaluator_name = parts[1]
+ else:
+ LOGGER.warning(
+ "Skipping column '%s' due to unexpected format. Expected at least three parts separated by '.'", col
+ )
+ continue
+ if evaluator_name:
+ # Count the occurrences of each unique value (pass/fail)
+ value_counts = df[col].value_counts().to_dict()
+
+ # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
+ total_rows = len(df)
+ pass_count = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
+ proportion = pass_count / total_rows if total_rows > 0 else 0.0
+
+ # Set the result with the evaluator name as the key
+ result_key = f"{evaluator_name}.{BINARY_AGGREGATE_SUFFIX}"
+ results[result_key] = round(proportion, 2)
+
+ return results
+
+
+ def _get_token_count_columns_to_exclude(df: pd.DataFrame) -> List[str]:
+ """Identify token count columns from known SDK metrics that should be excluded from aggregation.
+
+ Token counts from custom evaluators are not excluded, only those from EvaluationMetrics
+ and _InternalEvaluationMetrics.
+
+ :param df: The dataframe of evaluation results.
+ :type df: ~pandas.DataFrame
+ :return: List of column names to exclude from aggregation.
+ :rtype: List[str]
+ """
+ # Get all metric values from EvaluationMetrics class
+ evaluation_metrics_values = [
+ getattr(EvaluationMetrics, attr)
+ for attr in dir(EvaluationMetrics)
+ if not attr.startswith("_") and isinstance(getattr(EvaluationMetrics, attr), str)
+ ]
+
+ # Get all metric values from _InternalEvaluationMetrics class
+ internal_metrics_values = [
+ getattr(_InternalEvaluationMetrics, attr)
+ for attr in dir(_InternalEvaluationMetrics)
+ if not attr.startswith("_") and isinstance(getattr(_InternalEvaluationMetrics, attr), str)
+ ]
+
+ # Combine all known metrics
+ all_known_metrics = evaluation_metrics_values + internal_metrics_values
+
+ # Find token count columns that belong to known metrics
+ token_count_cols = [
+ col
+ for col in df.columns
+ if (
+ any(
+ col.endswith(f"{metric}_prompt_tokens")
+ or col.endswith(f"{metric}_completion_tokens")
+ or col.endswith(f"{metric}_total_tokens")
+ for metric in all_known_metrics
+ )
+ )
+ ]
+
+ return token_count_cols
+
+
+ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
  """Aggregate metrics from the evaluation results.
  On top of naively calculating the mean of most metrics, this function also identifies certain columns
  that represent defect rates and renames them accordingly. Other columns in the dataframe are dropped.
@@ -122,10 +340,12 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
  :param df: The dataframe of evaluation results.
  :type df: ~pandas.DataFrame
  :param evaluators: A dictionary mapping of strings to evaluator classes.
- :type evaluators: Dict[str, Type]
+ :type evaluators: Dict[str, Callable]
  :return: The aggregated metrics.
  :rtype: Dict[str, float]
  """
+ binary_metrics = _aggregation_binary_output(df)
+
  df.rename(columns={col: col.replace("outputs.", "") for col in df.columns}, inplace=True)

  handled_columns = []
@@ -133,134 +353,265 @@ def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Type]) -> Dict[st
  # Rename certain columns as defect rates if we know that's what their aggregates represent
  # Content safety metrics
  content_safety_cols, cs_defect_rates = _aggregate_content_safety_metrics(df, evaluators)
+ other_renamed_cols, renamed_cols = _aggregate_other_metrics(df)
  handled_columns.extend(content_safety_cols)
+ handled_columns.extend(other_renamed_cols)
  defect_rates.update(cs_defect_rates)
+ defect_rates.update(renamed_cols)
  # Label-based (true/false) metrics where 'true' means 'something is wrong'
  label_cols, label_defect_rates = _aggregate_label_defect_metrics(df)
  handled_columns.extend(label_cols)
  defect_rates.update(label_defect_rates)

+ # Exclude token count columns from aggregation for known SDK metrics
+ token_count_cols = _get_token_count_columns_to_exclude(df)
+ handled_columns.extend(token_count_cols)
+
  # For rest of metrics, we will calculate mean
  df.drop(columns=handled_columns, inplace=True)

+ # Convert "not applicable" strings to None to allow proper numeric aggregation
+ df = df.replace(EvaluatorBase._NOT_APPLICABLE_RESULT, None)
+
+ # NOTE: nan/None values don't count as as booleans, so boolean columns with
+ # nan/None values won't have a mean produced from them.
+ # This is different from label-based known evaluators, which have special handling.
  mean_value = df.mean(numeric_only=True)
  metrics = mean_value.to_dict()
  # Add defect rates back into metrics
  metrics.update(defect_rates)
+
+ # Add binary threshold metrics based on pass/fail results
+ metrics.update(binary_metrics)
+
  return metrics


- def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_target_fn=False):
+ def _validate_columns_for_target(
+ df: pd.DataFrame,
+ target: Callable,
+ ) -> None:
+ """
+ Check that all columns needed by target function are present.
+
+ :param df: The data frame to be validated.
+ :type df: pd.DataFrame
+ :param target: The callable to be applied to data set.
+ :type target: Optional[Callable]
+ :raises EvaluationException: If the column starts with "__outputs." or if the input data contains missing fields.
+ """
+ if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
+ msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
+ raise EvaluationException(
+ message=msg,
+ internal_message=msg,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.INVALID_VALUE,
+ blame=ErrorBlame.USER_ERROR,
+ )
+ # If the target function is given, it may return
+ # several columns and hence we cannot check the availability of columns
+ # without knowing target function semantics.
+ # Instead, here we will validate the columns, taken by target.
  required_inputs = [
  param.name
- for param in inspect.signature(evaluator).parameters.values()
+ for param in inspect.signature(target).parameters.values()
  if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
  ]

- missing_inputs = [col for col in required_inputs if col not in df_data.columns]
+ missing_inputs = [col for col in required_inputs if col not in df.columns]
  if missing_inputs:
- if not is_target_fn:
- msg = f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}."
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.MISSING_FIELD,
- blame=ErrorBlame.USER_ERROR,
- )
- msg = f"Missing required inputs for target : {missing_inputs}."
+ msg = f"Missing required inputs for target: {missing_inputs}."
  raise EvaluationException(
  message=msg,
- internal_message=msg,
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.MISSING_FIELD,
  blame=ErrorBlame.USER_ERROR,
  )


- def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
+ def _validate_columns_for_evaluators(
+ df: pd.DataFrame,
+ evaluators: Dict[str, Callable],
+ target: Optional[Callable],
+ target_generated_columns: Optional[Set[str]],
+ column_mapping: Dict[str, Dict[str, str]],
+ ) -> None:
+ """
+ Check that all columns needed by evaluators are present.
+
+ :param df: The data frame to be validated.
+ :type df: pd.DataFrame
+ :param evaluators: The dictionary of evaluators.
+ :type evaluators: Dict[str, Callable]
+ :param target: The callable to be applied to data set.
+ :type target: Optional[Callable]
+ :param target_generated_columns: The set of columns generated by the target callable.
+ :type target_generated_columns: Optional[Set[str]]
+ :param column_mapping: Dictionary mapping evaluator name to evaluator column mapping.
+ :type column_mapping: Dict[str, Dict[str, str]]
+ :raises EvaluationException: If data is missing required inputs or if the target callable did not generate the necessary columns.
+ """
+ missing_inputs_per_evaluator = {}
+
+ for evaluator_name, evaluator in evaluators.items():
+ # Apply column mapping
+ mapping_config = column_mapping.get(evaluator_name, column_mapping.get("default", None))
+ new_df = _apply_column_mapping(df, mapping_config)
+
+ # Validate input data for evaluator
+ is_built_in = evaluator.__module__.startswith("azure.ai.evaluation")
+ if is_built_in:
+ # Note that for built-in evaluators supporting the "conversation" parameter,
+ # input parameters are now optional.
+ evaluator_params = [
+ param.name
+ for param in inspect.signature(evaluator).parameters.values()
+ if param.name not in ["kwargs", "args", "self"]
+ ]
+
+ if "conversation" in evaluator_params and "conversation" in new_df.columns:
+ # Ignore the missing fields if "conversation" presents in the input data
+ missing_inputs = []
+ else:
+ optional_params = (
+ cast(Any, evaluator)._OPTIONAL_PARAMS  # pylint: disable=protected-access
+ if hasattr(evaluator, "_OPTIONAL_PARAMS")
+ else []
+ )
+ excluded_params = set(new_df.columns).union(optional_params)
+ missing_inputs = [col for col in evaluator_params if col not in excluded_params]
+
+ # If "conversation" is the only parameter and it is missing, keep it in the missing inputs
+ # Otherwise, remove it from the missing inputs
+ if "conversation" in missing_inputs:
+ if not (evaluator_params == ["conversation"] and missing_inputs == ["conversation"]):
+ missing_inputs.remove("conversation")
+ else:
+ evaluator_params = [
+ param.name
+ for param in inspect.signature(evaluator).parameters.values()
+ if param.default == inspect.Parameter.empty and param.name not in ["kwargs", "args", "self"]
+ ]
+
+ missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+
+ if missing_inputs:
+ missing_inputs_per_evaluator[evaluator_name] = missing_inputs
+
+ if missing_inputs_per_evaluator:
+ msg = "Some evaluators are missing required inputs:\n"
+ for evaluator_name, missing in missing_inputs_per_evaluator.items():
+ msg += f"- {evaluator_name}: {missing}\n"
+
+ # Add the additional notes
+ msg += "\nTo resolve this issue:\n"
+ msg += "- Ensure the data contains required inputs.\n"
+ if target is not None:
+ msg += "- Verify that the target is generating the necessary columns for the evaluators. "
+ msg += f"Currently generated columns: {target_generated_columns} \n"
+ msg += "- Check that the column mapping is correctly configured."
+
+ raise EvaluationException(
+ message=msg.strip(),
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.MISSING_FIELD,
+ blame=ErrorBlame.USER_ERROR,
+ )
+
+
+ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name, tags):
  if data is None:
- msg = "data parameter must be provided for evaluation."
+ msg = "The 'data' parameter is required for evaluation."
  raise EvaluationException(
  message=msg,
- internal_message=msg,
  target=ErrorTarget.EVALUATE,
- category=ErrorCategory.MISSING_FIELD,
+ category=ErrorCategory.INVALID_VALUE,
+ blame=ErrorBlame.USER_ERROR,
+ )
+ if not isinstance(data, (os.PathLike, str)):
+ msg = "The 'data' parameter must be a string or a path-like object."
+ raise EvaluationException(
+ message=msg,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.INVALID_VALUE,
+ blame=ErrorBlame.USER_ERROR,
+ )
+ if not os.path.exists(data):
+ msg = f"The input data file path '{data}' does not exist."
+ raise EvaluationException(
+ message=msg,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.USER_ERROR,
  )

  if target is not None:
  if not callable(target):
- msg = "target parameter must be a callable function."
+ msg = "The 'target' parameter must be a callable function."
  raise EvaluationException(
  message=msg,
- internal_message=msg,
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.USER_ERROR,
  )

- if data is not None:
- if not isinstance(data, str):
- msg = "data parameter must be a string."
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.INVALID_VALUE,
- blame=ErrorBlame.USER_ERROR,
- )
+ if not evaluators:
+ msg = "The 'evaluators' parameter is required and cannot be None or empty."
+ raise EvaluationException(
+ message=msg,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.INVALID_VALUE,
+ blame=ErrorBlame.USER_ERROR,
+ )
+ if not isinstance(evaluators, dict):
+ msg = "The 'evaluators' parameter must be a dictionary."
+ raise EvaluationException(
+ message=msg,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.INVALID_VALUE,
+ blame=ErrorBlame.USER_ERROR,
+ )

- if evaluators is not None:
- if not isinstance(evaluators, dict):
- msg = "evaluators parameter must be a dictionary."
+ if output_path is not None:
+ if not isinstance(output_path, (os.PathLike, str)):
+ msg = "The 'output_path' parameter must be a string or a path-like object."
  raise EvaluationException(
  message=msg,
- internal_message=msg,
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.USER_ERROR,
  )

- if output_path is not None:
- if not isinstance(output_path, str):
- msg = "output_path parameter must be a string."
+ output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+ if output_dir and not os.path.exists(output_dir):
+ msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
  raise EvaluationException(
  message=msg,
- internal_message=msg,
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.USER_ERROR,
  )

  if azure_ai_project is not None:
- if not isinstance(azure_ai_project, Dict):
- msg = "azure_ai_project parameter must be a dictionary."
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.INVALID_VALUE,
- blame=ErrorBlame.USER_ERROR,
- )
+ validate_azure_ai_project(azure_ai_project)

  if evaluation_name is not None:
- if not isinstance(evaluation_name, str):
- msg = "evaluation_name parameter must be a string."
+ if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+ msg = "The 'evaluation_name' parameter must be a non-empty string."
  raise EvaluationException(
  message=msg,
- internal_message=msg,
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.USER_ERROR,
  )

  try:
- initial_data_df = pd.read_json(data, lines=True)
+ data_loader = DataLoaderFactory.get_loader(data)
+ initial_data_df = data_loader.load()
  except Exception as e:
  raise EvaluationException(
- message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
- internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+ message=f"Unable to load data from '{data}'. Supported formats are JSONL and CSV. Detailed error: {e}.",
  target=ErrorTarget.EVALUATE,
  category=ErrorCategory.INVALID_VALUE,
  blame=ErrorBlame.USER_ERROR,
@@ -269,88 +620,67 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
  return initial_data_df


- def _validate_columns(
- df: pd.DataFrame,
- evaluators: Dict[str, Any],
- target: Optional[Callable],
- evaluator_config: Dict[str, Dict[str, str]],
- ) -> None:
- """
- Check that all columns needed by evaluator or target function are present.
-
- :param df: The data frame to be validated.
- :type df: pd.DataFrame
- :param evaluators: The dictionary of evaluators.
- :type evaluators: Dict[str, Any]
- :param target: The callable to be applied to data set.
- :type target: Optional[Callable]
- :param evaluator_config: The configuration for evaluators.
- :type evaluator_config: Dict[str, Dict[str, str]]
- :raises EvaluationException: If column starts from "__outputs." while target is defined.
- """
- if target:
- if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns):
- msg = "The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.'
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- target=ErrorTarget.EVALUATE,
- category=ErrorCategory.INVALID_VALUE,
- blame=ErrorBlame.USER_ERROR,
- )
- # If the target function is given, it may return
- # several columns and hence we cannot check the availability of columns
- # without knowing target function semantics.
- # Instead, here we will validate the columns, taken by target.
- _validate_input_data_for_evaluator(target, None, df, is_target_fn=True)
- else:
- for evaluator_name, evaluator in evaluators.items():
- # Apply column mapping
- mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
- new_df = _apply_column_mapping(df, mapping_config)
-
- # Validate input data for evaluator
- _validate_input_data_for_evaluator(evaluator, evaluator_name, new_df)
-
-
  def _apply_target_to_data(
  target: Callable,
- data: str,
- pf_client: PFClient,
+ data: Union[str, os.PathLike, pd.DataFrame],
+ batch_client: BatchClient,
  initial_data: pd.DataFrame,
  evaluation_name: Optional[str] = None,
- _run_name: Optional[str] = None,
- ) -> Tuple[pd.DataFrame, Set[str]]:
+ **kwargs,
+ ) -> Tuple[pd.DataFrame, Set[str], BatchClientRun]:
  """
  Apply the target function to the data set and return updated data and generated columns.

  :param target: The function to be applied to data.
  :type target: Callable
- :param data: The path to input jsonl file.
- :type data: str
- :param pf_client: The promptflow client to be used.
- :type pf_client: PFClient
+ :param data: The path to input jsonl or csv file.
+ :type data: Union[str, os.PathLike]
+ :param batch_client: The promptflow client to be used.
+ :type batch_client: PFClient
  :param initial_data: The data frame with the loaded data.
  :type initial_data: pd.DataFrame
  :param evaluation_name: The name of the evaluation.
  :type evaluation_name: Optional[str]
- :param _run_name: The name of target run. Used for testing only.
- :type _run_name: Optional[str]
  :return: The tuple, containing data frame and the list of added columns.
  :rtype: Tuple[pandas.DataFrame, List[str]]
  """
- # We are manually creating the temporary directory for the flow
- # because the way tempdir remove temporary directories will
- # hang the debugger, because promptflow will keep flow directory.
- run = pf_client.run(
- flow=target,
- display_name=evaluation_name,
- data=data,
- properties={"runType": "eval_run", "isEvaluatorRun": "true"},
- stream=True,
- name=_run_name,
- )
- target_output = pf_client.runs.get_details(run, all_results=True)
+
+ _run_name = kwargs.get("_run_name")
+ with TargetRunContext(batch_client):
+ run: BatchClientRun = batch_client.run(
+ flow=target,
+ display_name=evaluation_name,
+ data=data,
+ stream=True,
+ name=_run_name,
+ evaluator_name=getattr(target, "__qualname__", "TARGET"),
+ )
+ target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+ run_summary = batch_client.get_run_summary(run)
+
+ if run_summary["completed_lines"] == 0:
+ msg = (
+ f"Evaluation target failed to produce any results."
+ f" Please check the logs at {run_summary['log_path']} for more details about cause of failure."
+ )
+ raise EvaluationException(
+ message=msg,
+ target=ErrorTarget.EVALUATE,
+ category=ErrorCategory.FAILED_EXECUTION,
+ blame=ErrorBlame.USER_ERROR,
+ )
+
+ # Log a warning if some rows failed
+ failed_lines = run_summary.get("failed_lines", 0)
+ completed_lines = run_summary["completed_lines"]
+ total_lines = failed_lines + completed_lines
+
+ if failed_lines > 0:
+ LOGGER.warning(
+ f"Target function completed {completed_lines} out of {total_lines} rows. "
+ f"{failed_lines} rows failed and will be filled with NaN values."
+ )
+
  # Remove input and output prefix
  generated_columns = {
  col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
@@ -358,6 +688,13 @@ def _apply_target_to_data(
  # Sort output by line numbers
  target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
  target_output.sort_index(inplace=True)
+
+ initial_data_with_line_numbers = initial_data.copy()
+ initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+ complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+ target_output = target_output.reindex(complete_index)
+
  target_output.reset_index(inplace=True, drop=False)
  # target_output contains only input columns, taken by function,
  # so we need to concatenate it to the input data frame.
@@ -366,34 +703,36 @@ def _apply_target_to_data(
366
703
  # Rename outputs columns to __outputs
367
704
  rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
368
705
  target_output.rename(columns=rename_dict, inplace=True)
369
- # Concatenate output to input
370
- target_output = pd.concat([target_output, initial_data], axis=1)
706
+ # Concatenate output to input - now both dataframes have the same number of rows
707
+ target_output = pd.concat([initial_data, target_output], axis=1)
371
708
 
372
709
  return target_output, generated_columns, run
373
710
 
374
711
 
375
- def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]:
376
- """Process evaluator_config to replace ${target.} with ${data.}
712
+ def _process_column_mappings(
713
+ column_mapping: Dict[str, Optional[Dict[str, str]]],
714
+ ) -> Dict[str, Dict[str, str]]:
715
+ """Process column_mapping to replace ${target.} with ${data.}
377
716
 
378
- :param evaluator_config: The configuration for evaluators.
379
- :type evaluator_config: Dict[str, Dict[str, str]]
717
+ :param column_mapping: The configuration for evaluators.
718
+ :type column_mapping: Dict[str, Optional[Dict[str, str]]]
380
719
  :return: The processed configuration.
381
720
  :rtype: Dict[str, Dict[str, str]]
382
721
  """
383
722
 
384
- processed_config = {}
723
+ processed_config: Dict[str, Dict[str, str]] = {}
385
724
 
386
- unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
725
+ expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")
387
726
 
388
- if evaluator_config:
389
- for evaluator, mapping_config in evaluator_config.items():
727
+ if column_mapping:
728
+ for evaluator, mapping_config in column_mapping.items():
390
729
  if isinstance(mapping_config, dict):
391
730
  processed_config[evaluator] = {}
392
731
 
393
732
  for map_to_key, map_value in mapping_config.items():
394
733
  # Check if there's any unexpected reference other than ${target.} or ${data.}
395
- if unexpected_references.search(map_value):
396
- msg = "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
734
+ if not expected_references.search(map_value):
735
+ msg = "Unexpected references detected in 'column_mapping'. Ensure only ${target.} and ${data.} are used."
397
736
  raise EvaluationException(
398
737
  message=msg,
399
738
  internal_message=msg,
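
As a quick, hedged illustration of the reference check above (the sample mapping values are invented), only `${data.<col>}` and `${target.<col>}` style references, optionally dotted, pass the validation:

```python
import re

# Same pattern as expected_references above.
expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

for value in ["${data.query}", "${target.response}", "${data.item.context}", "${run.outputs.answer}", "query"]:
    print(value, "->", bool(expected_references.search(value)))
# The first three match; the last two would trigger the "Unexpected references" EvaluationException.
```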
@@ -432,94 +771,93 @@ def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame:
432
771
  return df
433
772
 
434
773
 
435
- # @log_evaluate_activity
436
774
  def evaluate(
437
775
  *,
438
- data: str,
439
- evaluators: Dict[str, Callable],
776
+ data: Union[str, os.PathLike],
777
+ evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
440
778
  evaluation_name: Optional[str] = None,
441
779
  target: Optional[Callable] = None,
442
- evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
443
- azure_ai_project: Optional[AzureAIProject] = None,
444
- output_path: Optional[str] = None,
780
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
781
+ azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
782
+ output_path: Optional[Union[str, os.PathLike]] = None,
783
+ fail_on_evaluator_errors: bool = False,
784
+ tags: Optional[Dict[str, str]] = None,
445
785
  **kwargs,
446
- ):
786
+ ) -> EvaluationResult:
447
787
  """Evaluates target or data with built-in or custom evaluators. If both target and data are provided,
448
788
  data will be run through target function and then results will be evaluated.
449
789
 
450
790
  :keyword data: Path to the data to be evaluated or passed to target if target is set.
451
- Only .jsonl format files are supported. `target` and `data` both cannot be None. Required.
791
+ JSONL and CSV files are supported. `target` and `data` both cannot be None. Required.
452
792
  :paramtype data: str
453
793
  :keyword evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
454
- and value as the evaluator function. Required.
455
- :paramtype evaluators: Dict[str, Callable]
794
+ and value as the evaluator function. Also accepts AzureOpenAIGrader instances as values, which are processed separately.
795
+ Required.
796
+ :paramtype evaluators: Dict[str, Union[Callable, ~azure.ai.evaluation.AzureOpenAIGrader]]
456
797
  :keyword evaluation_name: Display name of the evaluation.
457
798
  :paramtype evaluation_name: Optional[str]
458
799
  :keyword target: Target to be evaluated. `target` and `data` both cannot be None
459
800
  :paramtype target: Optional[Callable]
460
801
  :keyword evaluator_config: Configuration for evaluators. The configuration should be a dictionary with evaluator
461
- names as keys and a dictionary of column mappings as values. The column mappings should be a dictionary with
462
- keys as the column names in the evaluator input and values as the column names in the input data or data
463
- generated by target.
464
- :paramtype evaluator_config: Optional[Dict[str, Dict[str, str]]
802
+ names as keys and values that are dictionaries containing the column mappings. The column mappings should
803
+ be a dictionary with keys as the column names in the evaluator input and values as the column names in the
804
+ input data or data generated by target.
805
+ :paramtype evaluator_config: Optional[Dict[str, ~azure.ai.evaluation.EvaluatorConfig]]
465
806
  :keyword output_path: The local folder or file path to save evaluation results to if set. If folder path is provided
466
807
  the results will be saved to a file named `evaluation_results.json` in the folder.
467
808
  :paramtype output_path: Optional[str]
468
- :keyword azure_ai_project: Logs evaluation results to AI Studio if set.
469
- :paramtype azure_ai_project: Optional[~azure.ai.evaluation.AzureAIProject]
809
+ :keyword azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
810
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
811
+ :paramtype azure_ai_project: Optional[Union[str, ~azure.ai.evaluation.AzureAIProject]]
812
+ :keyword fail_on_evaluator_errors: Whether or not the evaluation should cancel early with an EvaluationException
813
+ if ANY evaluator fails during their evaluation.
814
+ Defaults to false, which means that evaluations will continue regardless of failures.
815
+ If such failures occur, metrics may be missing, and evidence of failures can be found in the evaluation's logs.
816
+ :paramtype fail_on_evaluator_errors: bool
817
+ :keyword tags: A dictionary of tags to be added to the evaluation run for tracking and organization purposes.
818
+ Keys and values must be strings. For more information about tag limits, see:
819
+ https://learn.microsoft.com/en-us/azure/machine-learning/resource-limits-capacity?view=azureml-api-2#runs
820
+ :paramtype tags: Optional[Dict[str, str]]
821
+ :keyword user_agent: A string to append to the default user-agent sent with evaluation http requests
822
+ :paramtype user_agent: Optional[str]
470
823
  :return: Evaluation results.
471
- :rtype: dict
472
-
473
- :Example:
474
-
475
- Evaluate API can be used as follows:
476
-
477
- .. code-block:: python
478
-
479
- from azure.ai.evaluation import evaluate, RelevanceEvaluator, CoherenceEvaluator
480
-
481
-
482
- model_config = {
483
- "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
484
- "api_key": os.environ.get("AZURE_OPENAI_KEY"),
485
- "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT")
486
- }
487
-
488
- coherence_eval = CoherenceEvaluator(model_config=model_config)
489
- relevance_eval = RelevanceEvaluator(model_config=model_config)
490
-
491
- path = "evaluate_test_data.jsonl"
492
- result = evaluate(
493
- data=path,
494
- evaluators={
495
- "coherence": coherence_eval,
496
- "relevance": relevance_eval,
497
- },
498
- evaluator_config={
499
- "coherence": {
500
- "response": "${data.response}",
501
- "query": "${data.query}"
502
- },
503
- "relevance": {
504
- "response": "${data.response}",
505
- "context": "${data.context}",
506
- "query": "${data.query}"
507
- }
508
- }
509
- )
510
-
824
+ :rtype: ~azure.ai.evaluation.EvaluationResult
825
+
826
+ .. admonition:: Example:
827
+
828
+ .. literalinclude:: ../samples/evaluation_samples_evaluate.py
829
+ :start-after: [START evaluate_method]
830
+ :end-before: [END evaluate_method]
831
+ :language: python
832
+ :dedent: 8
833
+ :caption: Run an evaluation on local data with one or more evaluators using azure.ai.evaluation.AzureAIProject
834
+
835
+ .. admonition:: Example using Azure AI Project URL:
836
+
837
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
838
+ :start-after: [START evaluate_method]
839
+ :end-before: [END evaluate_method]
840
+ :language: python
841
+ :dedent: 8
842
+ :caption: Run an evaluation on local data with one or more evaluators using an Azure AI Project URL in the following format
843
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
511
844
  """
512
845
  try:
513
- return _evaluate(
514
- evaluation_name=evaluation_name,
515
- target=target,
516
- data=data,
517
- evaluators=evaluators,
518
- evaluator_config=evaluator_config,
519
- azure_ai_project=azure_ai_project,
520
- output_path=output_path,
521
- **kwargs,
522
- )
846
+ user_agent: Optional[str] = kwargs.get("user_agent")
847
+ with UserAgentSingleton().add_useragent_product(user_agent) if user_agent else contextlib.nullcontext():
848
+ results = _evaluate(
849
+ evaluation_name=evaluation_name,
850
+ target=target,
851
+ data=data,
852
+ evaluators_and_graders=evaluators,
853
+ evaluator_config=evaluator_config,
854
+ azure_ai_project=azure_ai_project,
855
+ output_path=output_path,
856
+ fail_on_evaluator_errors=fail_on_evaluator_errors,
857
+ tags=tags,
858
+ **kwargs,
859
+ )
860
+ return results
523
861
  except Exception as e:
524
862
  # Handle multiprocess bootstrap error
525
863
  bootstrap_error = (
@@ -538,116 +876,802 @@ def evaluate(
538
876
  internal_message=error_message,
539
877
  target=ErrorTarget.EVALUATE,
540
878
  category=ErrorCategory.FAILED_EXECUTION,
541
- blame=ErrorBlame.UNKNOWN,
879
+ blame=ErrorBlame.USER_ERROR,
880
+ ) from e
881
+
882
+ # Ensure a consistent user experience when encountering errors by converting
883
+ # all other exceptions to EvaluationException.
884
+ if not isinstance(e, EvaluationException):
885
+ raise EvaluationException(
886
+ message=str(e),
887
+ target=ErrorTarget.EVALUATE,
888
+ category=ErrorCategory.FAILED_EXECUTION,
889
+ blame=ErrorBlame.SYSTEM_ERROR,
542
890
  ) from e
543
891
 
544
892
  raise e
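
Since the inline example was replaced by `literalinclude` samples, here is a minimal, hedged sketch of a local `evaluate` call; the dataset path, environment variable names, and column names are illustrative and not taken from the referenced sample files:

```python
import os
from azure.ai.evaluation import evaluate, CoherenceEvaluator, RelevanceEvaluator

model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
}

result = evaluate(
    data="evaluate_test_data.jsonl",  # hypothetical local JSONL dataset
    evaluators={
        "coherence": CoherenceEvaluator(model_config=model_config),
        "relevance": RelevanceEvaluator(model_config=model_config),
    },
    evaluator_config={
        "coherence": {"column_mapping": {"response": "${data.response}", "query": "${data.query}"}},
        "relevance": {
            "column_mapping": {
                "response": "${data.response}",
                "context": "${data.context}",
                "query": "${data.query}",
            }
        },
    },
)
print(result["metrics"])
```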
545
893
 
546
894
 
547
- def _evaluate( # pylint: disable=too-many-locals
895
+ def _print_summary(per_evaluator_results: Dict[str, Any]) -> None:
896
+ # Extract evaluators with a non-empty "run_summary"
897
+ output_dict = {
898
+ name: result["run_summary"] for name, result in per_evaluator_results.items() if result.get("run_summary")
899
+ }
900
+
901
+ if output_dict:
902
+ print("======= Combined Run Summary (Per Evaluator) =======\n")
903
+ print(json.dumps(output_dict, indent=4))
904
+ print("\n====================================================\n")
905
+
906
+
907
+ def _print_fail_flag_warning() -> None:
908
+ print(
909
+ "Notice: fail_on_evaluator_errors is enabled. It is recommended that you disable "
910
+ + "this flag for evaluations on large datasets (loosely defined as more than 10 rows of inputs, "
911
+ + "or more than 4 evaluators). Using this flag on large datasets runs the risk of large runs failing "
912
+ + "without producing any outputs, since a single failure will cancel the entire run "
913
+ "when fail_on_evaluator_errors is enabled."
914
+ )
915
+
916
+
917
+ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
548
918
  *,
919
+ evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
549
920
  evaluation_name: Optional[str] = None,
550
921
  target: Optional[Callable] = None,
551
- data: Optional[str] = None,
552
- evaluators: Optional[Dict[str, Callable]] = None,
553
- evaluator_config: Optional[Dict[str, Dict[str, str]]] = None,
554
- azure_ai_project: Optional[AzureAIProject] = None,
555
- output_path: Optional[str] = None,
922
+ data: Union[str, os.PathLike],
923
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
924
+ azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
925
+ output_path: Optional[Union[str, os.PathLike]] = None,
926
+ fail_on_evaluator_errors: bool = False,
927
+ tags: Optional[Dict[str, str]] = None,
556
928
  **kwargs,
557
- ):
558
- input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name)
929
+ ) -> EvaluationResult:
930
+ if fail_on_evaluator_errors:
931
+ _print_fail_flag_warning()
559
932
 
933
+ # Turn the input data into a dataframe, apply the target if needed,
934
+ # split graders and evaluators, and verify that column mappings are sensible.
935
+ validated_data = _preprocess_data(
936
+ data=data,
937
+ evaluators_and_graders=evaluators_and_graders,
938
+ evaluator_config=evaluator_config,
939
+ target=target,
940
+ output_path=output_path,
941
+ azure_ai_project=azure_ai_project,
942
+ evaluation_name=evaluation_name,
943
+ fail_on_evaluator_errors=fail_on_evaluator_errors,
944
+ tags=tags,
945
+ **kwargs,
946
+ )
947
+
948
+ # extract relevant info from validated data
949
+ column_mapping = validated_data["column_mapping"]
950
+ evaluators = validated_data["evaluators"]
951
+ graders = validated_data["graders"]
952
+ input_data_df = validated_data["input_data_df"]
953
+ results_df = pd.DataFrame()
954
+ metrics: Dict[str, float] = {}
955
+ eval_run_info_list: List[OAIEvalRunCreationInfo] = []
956
+ eval_run_summary_dict = {}
957
+
958
+ # Start OAI eval runs if any graders are present.
959
+ need_oai_run = len(graders) > 0
960
+ need_local_run = len(evaluators) > 0
961
+ need_get_oai_results = False
962
+ got_local_results = False
963
+ if need_oai_run:
964
+ try:
965
+ aoi_name = evaluation_name if evaluation_name else DEFAULT_OAI_EVAL_RUN_NAME
966
+ eval_run_info_list = _begin_aoai_evaluation(graders, column_mapping, input_data_df, aoi_name, **kwargs)
967
+ need_get_oai_results = len(eval_run_info_list) > 0
968
+ except EvaluationException as e:
969
+ if need_local_run:
970
+ # If there are normal evaluators, don't stop execution and try to run
971
+ # those.
972
+ LOGGER.warning(
973
+ "Remote Azure Open AI grader evaluations failed during run creation."
974
+ + " Continuing with local evaluators."
975
+ )
976
+ LOGGER.warning(e)
977
+ else:
978
+ raise e
979
+
980
+ # Evaluate 'normal' evaluators. This includes built-in evaluators and any user-supplied callables.
981
+ if need_local_run:
982
+ try:
983
+ eval_result_df, eval_metrics, per_evaluator_results = _run_callable_evaluators(
984
+ validated_data=validated_data, fail_on_evaluator_errors=fail_on_evaluator_errors
985
+ )
986
+ results_df = eval_result_df
987
+ metrics = eval_metrics
988
+ got_local_results = True
989
+ # TODO figure out how to update this printing to include OAI results?
990
+ _print_summary(per_evaluator_results)
991
+ eval_run_summary_dict = {name: result["run_summary"] for name, result in per_evaluator_results.items()}
992
+ LOGGER.info(f"run_summary: \r\n{json.dumps(eval_run_summary_dict, indent=4)}")
993
+ except EvaluationException as e:
994
+ if need_get_oai_results:
995
+ # If there are OAI graders, we only print a warning on local failures.
996
+ LOGGER.warning("Local evaluations failed. Will still attempt to retrieve online grader results.")
997
+ LOGGER.warning(e)
998
+ else:
999
+ raise e
1000
+
1001
+ # Retrieve OAI eval run results if needed.
1002
+ if need_get_oai_results:
1003
+ try:
1004
+ aoai_results, aoai_metrics = _get_evaluation_run_results(eval_run_info_list) # type: ignore
1005
+ # Post build TODO: add equivalent of _print_summary(per_evaluator_results) here
1006
+
1007
+ # Combine results if both evaluators and graders are present
1008
+ if len(evaluators) > 0:
1009
+ results_df = pd.concat([results_df, aoai_results], axis=1)
1010
+ metrics.update(aoai_metrics)
1011
+ else:
1012
+ # Otherwise combine aoai results with input data df to include input columns in outputs.
1013
+ results_df = pd.concat([input_data_df, aoai_results], axis=1)
1014
+ metrics = aoai_metrics
1015
+ except EvaluationException as e:
1016
+ if got_local_results:
1017
+ # If there are local eval results, we only print a warning on OAI failure.
1018
+ LOGGER.warning("Remote Azure Open AI grader evaluations failed. Still returning local results.")
1019
+ LOGGER.warning(e)
1020
+ else:
1021
+ raise e
1022
+
1023
+ # Done with all evaluations; massage outputs into their final form and log results if needed.
1024
+ name_map = _map_names_to_builtins(evaluators, graders)
1025
+ if is_onedp_project(azure_ai_project):
1026
+ studio_url = _log_metrics_and_instance_results_onedp(
1027
+ metrics, results_df, azure_ai_project, evaluation_name, name_map, tags=tags, **kwargs
1028
+ )
1029
+ else:
1030
+ # Since tracing is disabled, pass None for target_run so a dummy evaluation run will be created each time.
1031
+ trace_destination = _trace_destination_from_project_scope(azure_ai_project) if azure_ai_project else None
1032
+ studio_url = None
1033
+ if trace_destination:
1034
+ studio_url = _log_metrics_and_instance_results(
1035
+ metrics, results_df, trace_destination, None, evaluation_name, name_map, tags=tags, **kwargs
1036
+ )
1037
+
1038
+ result_df_dict = results_df.to_dict("records")
1039
+ result: EvaluationResult = {"rows": result_df_dict, "metrics": metrics, "studio_url": studio_url} # type: ignore
1040
+ # _add_aoai_structured_results_to_results(result, LOGGER, kwargs.get("eval_meta_data"))
1041
+
1042
+ eval_id: Optional[str] = kwargs.get("_eval_id")
1043
+ eval_run_id: Optional[str] = kwargs.get("_eval_run_id")
1044
+ eval_meta_data: Optional[Dict[str, Any]] = kwargs.get("_eval_meta_data")
1045
+ if kwargs.get("_convert_to_aoai_evaluation_result", False):
1046
+ _convert_results_to_aoai_evaluation_results(
1047
+ result, LOGGER, eval_id, eval_run_id, evaluators_and_graders, eval_run_summary_dict, eval_meta_data
1048
+ )
1049
+ if app_insights_configuration := kwargs.get("_app_insights_configuration"):
1050
+ emit_eval_result_events_to_app_insights(
1051
+ app_insights_configuration, result["_evaluation_results_list"], evaluator_config
1052
+ )
1053
+
1054
+ if output_path:
1055
+ _write_output(output_path, result)
1056
+ return result
1057
+
1058
+
1059
+ def _build_internal_log_attributes(
1060
+ event_data: Dict[str, Any],
1061
+ metric_name: str,
1062
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]],
1063
+ internal_log_attributes: Dict[str, str],
1064
+ ) -> Dict[str, str]:
1065
+ """
1066
+ Build internal log attributes for OpenTelemetry logging.
1067
+
1068
+ :param event_data: The event data containing threshold and name information
1069
+ :type event_data: Dict[str, Any]
1070
+ :param metric_name: The name of the metric being evaluated
1071
+ :type metric_name: str
1072
+ :param evaluator_config: Configuration for evaluators
1073
+ :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
+ :param internal_log_attributes: The dictionary of internal log attributes to update and return
+ :type internal_log_attributes: Dict[str, str]
1074
+ :return: Dictionary of internal log attributes
1075
+ :rtype: Dict[str, str]
1076
+ """
1077
+ # Add threshold if present
1078
+ if event_data.get("threshold"):
1079
+ internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"])
1080
+
1081
+ # Add testing criteria details if present
1082
+ testing_criteria_name = event_data.get("name")
1083
+ if testing_criteria_name:
1084
+ internal_log_attributes["gen_ai.evaluation.testing_criteria.name"] = testing_criteria_name
1085
+
1086
+ # Get evaluator definition details
1087
+ if evaluator_config and testing_criteria_name in evaluator_config:
1088
+ testing_criteria_config = evaluator_config[testing_criteria_name]
1089
+
1090
+ if evaluator_name := testing_criteria_config.get("_evaluator_name"):
1091
+ internal_log_attributes["gen_ai.evaluator.name"] = str(evaluator_name)
1092
+
1093
+ if evaluator_version := testing_criteria_config.get("_evaluator_version"):
1094
+ internal_log_attributes["gen_ai.evaluator.version"] = str(evaluator_version)
1095
+
1096
+ if evaluator_id := testing_criteria_config.get("_evaluator_id"):
1097
+ internal_log_attributes["gen_ai.evaluator.id"] = str(evaluator_id)
1098
+
1099
+ if evaluator_definition := testing_criteria_config.get("_evaluator_definition"):
1100
+ metric_config_detail = evaluator_definition.get("metrics").get(metric_name)
1101
+
1102
+ if metric_config_detail:
1103
+ if metric_config_detail.get("min_value") is not None:
1104
+ internal_log_attributes["gen_ai.evaluation.min_value"] = str(metric_config_detail["min_value"])
1105
+ if metric_config_detail.get("max_value") is not None:
1106
+ internal_log_attributes["gen_ai.evaluation.max_value"] = str(metric_config_detail["max_value"])
1107
+
1108
+ return internal_log_attributes
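
A small, hedged usage sketch of the helper above; the event data is invented and no evaluator_config is supplied, so only the threshold and criteria-name branches apply:

```python
event_data = {"metric": "relevance", "name": "relevance", "threshold": 3}
attrs = _build_internal_log_attributes(event_data, "relevance", None, {})
print(attrs)
# {'gen_ai.evaluation.threshold': '3', 'gen_ai.evaluation.testing_criteria.name': 'relevance'}
```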
1109
+
1110
+
1111
+ def _log_events_to_app_insights(
1112
+ event_logger,
1113
+ events: List[Dict[str, Any]],
1114
+ log_attributes: Dict[str, Any],
1115
+ app_insights_config: AppInsightsConfig,
1116
+ data_source_item: Optional[Dict[str, Any]] = None,
1117
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
1118
+ ) -> None:
1119
+ """
1120
+ Log independent events directly to App Insights using OpenTelemetry event logging.
1121
+
1122
+ :param event_logger: OpenTelemetry event logger instance
1123
+ :type event_logger: EventLogger
1124
+ :param events: List of event data dictionaries to log
1125
+ :type events: List[Dict[str, Any]]
1126
+ :param log_attributes: Attributes dict to use for each event (already includes extra_attributes if present)
1127
+ :type log_attributes: Dict[str, Any]
1128
+ :param app_insights_config: App Insights configuration containing connection string
1129
+ :type app_insights_config: AppInsightsConfig
1130
+ :param data_source_item: Data source item containing trace, response, and agent information
1131
+ :type data_source_item: Optional[Dict[str, Any]]
+ :param evaluator_config: Configuration for evaluators, used to enrich the internal log attributes
+ :type evaluator_config: Optional[Dict[str, EvaluatorConfig]]
1132
+ """
1133
+
1134
+ from opentelemetry._events import Event
1135
+
1136
+ try:
1137
+ # Initialize values from AppInsights config as defaults
1138
+ trace_id = None
1139
+ span_id = None
1140
+ response_id = None
1141
+ conversation_id = None
1142
+ previous_response_id = None
1143
+ agent_id = app_insights_config.get("agent_id", None)
1144
+ agent_version = app_insights_config.get("agent_version", None)
1145
+ agent_name = app_insights_config.get("agent_name", None)
1146
+
1147
+ # Data source item values have higher priority and will override AppInsights config defaults
1148
+ if data_source_item:
1149
+ for key, value in data_source_item.items():
1150
+ if key.endswith("trace_id") and value and isinstance(value, str):
1151
+ # Remove dashes if present
1152
+ trace_id_str = str(value).replace("-", "").lower()
1153
+ if len(trace_id_str) == 32: # Valid trace_id length
1154
+ trace_id = int(trace_id_str, 16)
1155
+ elif key == "previous_response_id" and value and isinstance(value, str):
1156
+ previous_response_id = value
1157
+ elif key == "response_id" and value and isinstance(value, str):
1158
+ response_id = value
1159
+ elif key == "conversation_id" and value and isinstance(value, str):
1160
+ conversation_id = value
1161
+ elif key == "agent_id" and value and isinstance(value, str):
1162
+ agent_id = value
1163
+ elif key.endswith("span_id") and value and isinstance(value, str):
1164
+ # Remove dashes if present and convert to int
1165
+ span_id_str = str(value).replace("-", "").lower()
1166
+ if len(span_id_str) == 16: # Valid span_id length (64-bit = 16 hex chars)
1167
+ span_id = int(span_id_str, 16)
1168
+ elif key == "agent_version" and value and isinstance(value, str):
1169
+ agent_version = value
1170
+ elif key == "agent_name" and value and isinstance(value, str):
1171
+ agent_name = value
1172
+
1173
+ # Log each event as a separate log record
1174
+ for i, event_data in enumerate(events):
1175
+ try:
1176
+ # Prepare log record attributes with specific mappings
1177
+ # The standard attributes are already in https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-events.md#event-eventgen_aievaluationresult
1178
+ metric_name = event_data.get("metric")
1179
+ standard_log_attributes = {}
1180
+ # This attributes makes evaluation events to go into customEvents table in App Insights
1181
+ standard_log_attributes["microsoft.custom_event.name"] = EVALUATION_EVENT_NAME
1182
+ standard_log_attributes["gen_ai.evaluation.name"] = metric_name
1183
+ if event_data.get("score") is not None:
1184
+ standard_log_attributes["gen_ai.evaluation.score.value"] = event_data.get("score")
1185
+ if event_data.get("label") is not None:
1186
+ standard_log_attributes["gen_ai.evaluation.score.label"] = event_data.get("label")
1187
+
1188
+ # Internal proposed attributes
1189
+ # Put it in internal property bag for now, will be expanded if we got sign-off to Otel standard later.
1190
+ internal_log_attributes = _build_internal_log_attributes(
1191
+ event_data, metric_name, evaluator_config, log_attributes
1192
+ )
1193
+
1194
+ # Optional field that may not always be present
1195
+ if "reason" in event_data:
1196
+ standard_log_attributes["gen_ai.evaluation.explanation"] = str(event_data["reason"])
1197
+
1198
+ # Handle error from sample if present
1199
+ # Put the error message in error.type to follow OTel semantic conventions
1200
+ error = event_data.get("sample", {}).get("error", {}).get("message", None)
1201
+ if error:
1202
+ standard_log_attributes["error.type"] = error
1203
+
1204
+ # Handle redteam attack properties if present
1205
+ if "properties" in event_data:
1206
+ properties = event_data["properties"]
1207
+
1208
+ if "attack_success" in properties:
1209
+ internal_log_attributes["gen_ai.redteam.attack.success"] = str(properties["attack_success"])
1210
+
1211
+ if "attack_technique" in properties:
1212
+ internal_log_attributes["gen_ai.redteam.attack.technique"] = str(properties["attack_technique"])
1213
+
1214
+ if "attack_complexity" in properties:
1215
+ internal_log_attributes["gen_ai.redteam.attack.complexity"] = str(
1216
+ properties["attack_complexity"]
1217
+ )
1218
+
1219
+ if "attack_success_threshold" in properties:
1220
+ internal_log_attributes["gen_ai.redteam.attack.success_threshold"] = str(
1221
+ properties["attack_success_threshold"]
1222
+ )
1223
+
1224
+ # Add data source item attributes if present
1225
+ if response_id:
1226
+ standard_log_attributes["gen_ai.response.id"] = response_id
1227
+ if conversation_id:
1228
+ standard_log_attributes["gen_ai.conversation.id"] = conversation_id
1229
+ if previous_response_id:
1230
+ internal_log_attributes["gen_ai.previous.response.id"] = previous_response_id
1231
+ if agent_id:
1232
+ standard_log_attributes["gen_ai.agent.id"] = agent_id
1233
+ if agent_name:
1234
+ standard_log_attributes["gen_ai.agent.name"] = agent_name
1235
+ if agent_version:
1236
+ internal_log_attributes["gen_ai.agent.version"] = agent_version
1237
+
1238
+ # Combine standard and internal attributes, put internal under the properties bag
1239
+ standard_log_attributes["internal_properties"] = json.dumps(internal_log_attributes)
1240
+ # Anonymize IP address to prevent Azure GeoIP enrichment and location tracking
1241
+ standard_log_attributes["http.client_ip"] = "0.0.0.0"
1242
+
1243
+ event_logger.emit(
1244
+ Event(
1245
+ name=EVALUATION_EVENT_NAME,
1246
+ attributes=standard_log_attributes,
1247
+ body=EVALUATION_EVENT_NAME,
1248
+ trace_id=trace_id if trace_id is not None else None,
1249
+ span_id=span_id if span_id is not None else None,
1250
+ )
1251
+ )
1252
+
1253
+ except Exception as e:
1254
+ LOGGER.warning(f"Failed to log event {i}: {e}")
1255
+
1256
+ except Exception as e:
1257
+ LOGGER.error(f"Failed to log events to App Insights: {e}")
1258
+
1259
+
1260
+ def emit_eval_result_events_to_app_insights(
1261
+ app_insights_config: AppInsightsConfig,
1262
+ results: List[Dict],
1263
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
1264
+ ) -> None:
1265
+ """
1266
+ Emit evaluation result events to App Insights using OpenTelemetry logging.
1267
+ Each result is logged as an independent log record, potentially including trace context.
1268
+
1269
+ :param app_insights_config: App Insights configuration containing connection string
1270
+ :type app_insights_config: AppInsightsConfig
1271
+ :param results: List of evaluation results to log
1272
+ :type results: List[Dict]
1273
+ """
1274
+
1275
+ from opentelemetry import _logs
1276
+ from opentelemetry.sdk._logs import LoggerProvider
1277
+ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
1278
+ from opentelemetry.sdk.resources import Resource
1279
+ from opentelemetry.semconv.resource import ResourceAttributes
1280
+ from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter
1281
+ from opentelemetry._events import get_event_logger
1282
+ from opentelemetry.sdk._events import EventLoggerProvider
1283
+
1284
+ if not results:
1285
+ LOGGER.debug("No results to log to App Insights")
1286
+ return
1287
+
1288
+ try:
1289
+ # Configure OpenTelemetry logging with anonymized Resource attributes
1290
+
1291
+ # Create a resource with minimal attributes to prevent sensitive data collection
1292
+ # SERVICE_INSTANCE_ID maps to cloud_RoleInstance in Azure Monitor and prevents
1293
+ # Azure Monitor from auto-detecting the device hostname
1294
+ anonymized_resource = Resource.create(
1295
+ {
1296
+ ResourceAttributes.SERVICE_NAME: "unknown",
1297
+ ResourceAttributes.SERVICE_INSTANCE_ID: "unknown",
1298
+ }
1299
+ )
1300
+
1301
+ logger_provider = LoggerProvider(resource=anonymized_resource)
1302
+ _logs.set_logger_provider(logger_provider)
1303
+
1304
+ # Create Azure Monitor log exporter
1305
+ azure_log_exporter = AzureMonitorLogExporter(connection_string=app_insights_config["connection_string"])
1306
+
1307
+ # Add the Azure Monitor exporter to the logger provider
1308
+ logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
1309
+
1310
+ # Create event logger
1311
+ event_provider = EventLoggerProvider(logger_provider)
1312
+ event_logger = get_event_logger(__name__, event_logger_provider=event_provider)
1313
+
1314
+ # Initialize base log attributes with extra_attributes if present, otherwise empty dict
1315
+ base_log_attributes = app_insights_config.get("extra_attributes", {})
1316
+
1317
+ # Add AppInsights config attributes with proper semantic convention mappings
1318
+ if "run_type" in app_insights_config:
1319
+ base_log_attributes["gen_ai.evaluation.azure_ai_type"] = str(app_insights_config["run_type"])
1320
+ if "schedule_type" in app_insights_config:
1321
+ base_log_attributes["gen_ai.evaluation.azure_ai_scheduled"] = str(app_insights_config["schedule_type"])
1322
+ if "run_id" in app_insights_config:
1323
+ base_log_attributes["gen_ai.evaluation.run.id"] = str(app_insights_config["run_id"])
1324
+ if "project_id" in app_insights_config:
1325
+ base_log_attributes["gen_ai.azure_ai_project.id"] = str(app_insights_config["project_id"])
1326
+
1327
+ for result in results:
1328
+ # Create a copy of base attributes for this result's events
1329
+ log_attributes = base_log_attributes.copy()
1330
+
1331
+ _log_events_to_app_insights(
1332
+ event_logger=event_logger,
1333
+ events=result["results"],
1334
+ log_attributes=log_attributes,
1335
+ data_source_item=result["datasource_item"] if "datasource_item" in result else None,
1336
+ evaluator_config=evaluator_config,
1337
+ app_insights_config=app_insights_config,
1338
+ )
1339
+ # Force flush to ensure events are sent
1340
+ logger_provider.force_flush()
1341
+ LOGGER.info(f"Successfully logged {len(results)} evaluation results to App Insights")
1342
+
1343
+ except Exception as e:
1344
+ LOGGER.error(f"Failed to emit evaluation results to App Insights: {e}")
1345
+
1346
+
1347
+ def _preprocess_data(
1348
+ data: Union[str, os.PathLike],
1349
+ evaluators_and_graders: Dict[str, Union[Callable, AzureOpenAIGrader]],
1350
+ evaluator_config: Optional[Dict[str, EvaluatorConfig]] = None,
1351
+ target: Optional[Callable] = None,
1352
+ output_path: Optional[Union[str, os.PathLike]] = None,
1353
+ azure_ai_project: Optional[Union[str, AzureAIProject]] = None,
1354
+ evaluation_name: Optional[str] = None,
1355
+ fail_on_evaluator_errors: bool = False,
1356
+ tags: Optional[Dict[str, str]] = None,
1357
+ **kwargs,
1358
+ ) -> __ValidatedData:
560
1359
  # Process evaluator config to replace ${target.} with ${data.}
561
1360
  if evaluator_config is None:
562
1361
  evaluator_config = {}
563
- evaluator_config = _process_evaluator_config(evaluator_config)
564
- _validate_columns(input_data_df, evaluators, target, evaluator_config)
565
-
566
- # Target Run
567
- pf_client = PFClient(
568
- config=(
569
- {"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None
570
- ),
571
- user_agent=USER_AGENT,
572
- )
573
1362
 
574
- trace_destination = pf_client._config.get_trace_destination() # pylint: disable=protected-access
575
- target_run = None
576
- target_generated_columns = set()
1363
+ input_data_df = _validate_and_load_data(
1364
+ target, data, evaluators_and_graders, output_path, azure_ai_project, evaluation_name, tags
1365
+ )
1366
+ if target is not None:
1367
+ _validate_columns_for_target(input_data_df, target)
1368
+
1369
+ # extract column mapping dicts into dictionary mapping evaluator name to column mapping
1370
+ column_mapping = _process_column_mappings(
1371
+ {
1372
+ evaluator_name: evaluator_configuration.get("column_mapping", None)
1373
+ for evaluator_name, evaluator_configuration in evaluator_config.items()
1374
+ }
1375
+ )
577
1376
 
578
1377
  # Create default configuration for evaluators that directly maps
579
1378
  # input data names to keyword inputs of the same name in the evaluators.
580
- evaluator_config = evaluator_config or {}
581
- evaluator_config.setdefault("default", {})
1379
+ column_mapping = column_mapping or {}
1380
+ column_mapping.setdefault("default", {})
1381
+
1382
+ # Split normal evaluators and OAI graders
1383
+ evaluators, graders = _split_evaluators_and_grader_configs(evaluators_and_graders)
1384
+
1385
+ target_run: Optional[BatchClientRun] = None
1386
+ target_generated_columns: Set[str] = set()
1387
+ batch_run_client: BatchClient
1388
+ batch_run_data: Union[str, os.PathLike, pd.DataFrame] = data
1389
+
1390
+ def get_client_type(evaluate_kwargs: Dict[str, Any]) -> Literal["run_submitter", "pf_client", "code_client"]:
1391
+ """Determines the BatchClient to use from provided kwargs (_use_run_submitter_client and _use_pf_client)"""
1392
+ _use_run_submitter_client = cast(Optional[bool], kwargs.pop("_use_run_submitter_client", None))
1393
+ _use_pf_client = cast(Optional[bool], kwargs.pop("_use_pf_client", None))
1394
+
1395
+ if _use_run_submitter_client is None and _use_pf_client is None:
1396
+ # If both are unset, return default
1397
+ return "run_submitter"
1398
+
1399
+ if _use_run_submitter_client and _use_pf_client:
1400
+ raise EvaluationException(
1401
+ message="Only one of _use_pf_client and _use_run_submitter_client should be set to True.",
1402
+ target=ErrorTarget.EVALUATE,
1403
+ category=ErrorCategory.INVALID_VALUE,
1404
+ blame=ErrorBlame.USER_ERROR,
1405
+ )
1406
+
1407
+ if _use_run_submitter_client == False and _use_pf_client == False:
1408
+ return "code_client"
1409
+
1410
+ if _use_run_submitter_client:
1411
+ return "run_submitter"
1412
+ if _use_pf_client:
1413
+ return "pf_client"
1414
+
1415
+ if _use_run_submitter_client is None and _use_pf_client == False:
1416
+ return "run_submitter"
1417
+ if _use_run_submitter_client == False and _use_pf_client is None:
1418
+ return "pf_client"
1419
+
1420
+ assert False, "This should be impossible"
1421
+
1422
+ client_type: Literal["run_submitter", "pf_client", "code_client"] = get_client_type(kwargs)
1423
+
1424
+ if client_type == "run_submitter":
1425
+ batch_run_client = RunSubmitterClient(raise_on_errors=fail_on_evaluator_errors)
1426
+ batch_run_data = input_data_df
1427
+ elif client_type == "pf_client":
1428
+ batch_run_client = ProxyClient(user_agent=UserAgentSingleton().value)
1429
+ # Ensure the absolute path is passed to pf.run, as a relative path doesn't work with
1430
+ # multiple evaluators. If the path is already absolute, abspath will return the original path.
1431
+ batch_run_data = os.path.abspath(data)
1432
+ elif client_type == "code_client":
1433
+ batch_run_client = CodeClient()
1434
+ batch_run_data = input_data_df
582
1435
 
583
1436
  # If target is set, apply 1-1 column mapping from target outputs to evaluator inputs
584
1437
  if data is not None and target is not None:
585
1438
  input_data_df, target_generated_columns, target_run = _apply_target_to_data(
586
- target, data, pf_client, input_data_df, evaluation_name, _run_name=kwargs.get("_run_name")
1439
+ target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
587
1440
  )
588
1441
 
589
- for evaluator_name, mapping in evaluator_config.items():
590
- mapped_to_values = set(mapping.values())
591
- for col in target_generated_columns:
592
- # If user defined mapping differently, do not change it.
593
- # If it was mapped to target, we have already changed it
594
- # in _process_evaluator_config
595
- run_output = f"${{run.outputs.{col}}}"
596
- # We will add our mapping only if
597
- # customer did not mapped target output.
598
- if col not in mapping and run_output not in mapped_to_values:
599
- evaluator_config[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup
600
-
601
- # After we have generated all columns we can check if we have
602
- # everything we need for evaluators.
603
- _validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)
1442
+ # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
1443
+ # This ensures that evaluators get all rows (including failed ones with NaN values)
1444
+ if isinstance(batch_run_client, ProxyClient):
1445
+ # Create a temporary JSONL file with the complete dataframe
1446
+ temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
1447
+ try:
1448
+ for _, row in input_data_df.iterrows():
1449
+ row_dict = row.to_dict()
1450
+ temp_file.write(json.dumps(row_dict) + "\n")
1451
+ temp_file.close()
1452
+ batch_run_data = temp_file.name
1453
+
1454
+ # Update column mappings to use data references instead of run outputs
1455
+ for evaluator_name, mapping in column_mapping.items():
1456
+ mapped_to_values = set(mapping.values())
1457
+ for col in target_generated_columns:
1458
+ # Use data reference instead of run output to ensure we get all rows
1459
+ target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
1460
+
1461
+ # We will add our mapping only if customer did not map target output.
1462
+ if col not in mapping and target_reference not in mapped_to_values:
1463
+ column_mapping[evaluator_name][col] = target_reference
1464
+
1465
+ # Don't pass the target_run since we're now using the complete dataframe
1466
+ target_run = None
1467
+
1468
+ except Exception as e:
1469
+ # Clean up the temp file if something goes wrong
1470
+ if os.path.exists(temp_file.name):
1471
+ os.unlink(temp_file.name)
1472
+ raise e
1473
+ else:
1474
+ # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
1475
+ batch_run_data = input_data_df
1476
+
1477
+ # Update column mappings for DataFrame clients
1478
+ for evaluator_name, mapping in column_mapping.items():
1479
+ mapped_to_values = set(mapping.values())
1480
+ for col in target_generated_columns:
1481
+ target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
1482
+
1483
+ # We will add our mapping only if customer did not map target output.
1484
+ if col not in mapping and target_reference not in mapped_to_values:
1485
+ column_mapping[evaluator_name][col] = target_reference
1486
+
1487
+ # After we have generated all columns, we can check if we have everything we need for evaluators.
1488
+ _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
604
1489
 
605
1490
  # Apply 1-1 mapping from input data to evaluator inputs, excluding values already assigned
606
1491
  # via target mapping.
607
1492
  # If both the data and the output dictionary of the target function
608
1493
  # have the same column, then the target function value is used.
1494
+ # NEW: flatten nested object columns (e.g., 'item') so we can map leaf values automatically.
1495
+ # Ensure the data does not contain top-level 'conversation' or 'messages' columns (which indicate chat/conversation data)
609
1496
  if input_data_df is not None:
1497
+ if "conversation" in input_data_df.columns or "messages" in input_data_df.columns:
1498
+ # No action is taken when 'conversation' or 'messages' columns are present,
1499
+ # as these indicate chat/conversation data which should not be flattened or mapped by default.
1500
+ pass
1501
+ else:
1502
+ input_data_df = _flatten_object_columns_for_default_mapping(input_data_df)
1503
+
1504
+ # Build default mapping for leaves:
1505
+ if input_data_df is not None:
1506
+ # First, map flattened nested columns (those containing a dot) to leaf names.
610
1507
  for col in input_data_df.columns:
611
- # Ignore columns added by target mapping. These are formatted as "__outputs.<column_name>"
612
- # Also ignore columns that are already in config, since they've been covered by target mapping.
613
- if not col.startswith(Prefixes.TSG_OUTPUTS) and col not in evaluator_config["default"].keys():
614
- evaluator_config["default"][col] = f"${{data.{col}}}"
615
- # Batch Run
616
- evaluators_info = {}
617
- use_pf_client = kwargs.get("_use_pf_client", True)
618
- if use_pf_client:
619
- batch_run_client = ProxyClient(pf_client)
620
-
621
- # Ensure the absolute path is passed to pf.run, as relative path doesn't work with
622
- # multiple evaluators. If the path is already absolute, abspath will return the original path.
623
- data = os.path.abspath(data)
1508
+ # Skip target output columns
1509
+ if col.startswith(Prefixes.TSG_OUTPUTS):
1510
+ continue
1511
+ # Skip root container columns (no dot) here; they'll be handled below if truly primitive.
1512
+ if "." in col:
1513
+ leaf_name = col.split(".")[-1]
1514
+ if leaf_name not in column_mapping["default"]:
1515
+ column_mapping["default"][leaf_name] = f"${{data.{col}}}"
1516
+
1517
+ # Then, handle remaining top-level primitive columns (original logic).
1518
+ for col in input_data_df.columns:
1519
+ if (
1520
+ not col.startswith(Prefixes.TSG_OUTPUTS)
1521
+ and col not in column_mapping["default"].keys()
1522
+ and "." not in col # only pure top-level primitives
1523
+ ):
1524
+ column_mapping["default"][col] = f"${{data.{col}}}"
1525
+
1526
+ return __ValidatedData(
1527
+ evaluators=evaluators,
1528
+ graders=graders,
1529
+ input_data_df=input_data_df,
1530
+ column_mapping=column_mapping,
1531
+ target_run=target_run,
1532
+ batch_run_client=batch_run_client,
1533
+ batch_run_data=batch_run_data,
1534
+ )
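
To make the default-mapping behaviour built up above concrete, here is a condensed, hedged sketch (column names invented, and the `__outputs.` prefix assumed) of what ends up in `column_mapping["default"]`:

```python
columns = ["query", "item.context", "__outputs.response"]

default_mapping = {}
for col in columns:
    if col.startswith("__outputs."):
        continue  # target outputs are mapped earlier via ${data.__outputs.<col>}
    key = col.split(".")[-1] if "." in col else col  # flattened columns map by leaf name
    default_mapping.setdefault(key, f"${{data.{col}}}")

print(default_mapping)  # {'query': '${data.query}', 'context': '${data.item.context}'}
```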
1535
+
1536
+
1537
+ def _flatten_object_columns_for_default_mapping(
1538
+ df: pd.DataFrame, root_prefixes: Optional[Iterable[str]] = None
1539
+ ) -> pd.DataFrame:
1540
+ """Flatten nested dictionary-valued columns into dotted leaf columns.
1541
+
1542
+ For any column whose cells (in at least one row) are ``dict`` objects, this utility discovers all
1543
+ leaf paths (recursively descending only through ``dict`` nodes) and materializes new DataFrame
1544
+ columns named ``"<original_col>.<nested.path.leaf>"`` for every unique leaf encountered across
1545
+ all rows. A *leaf* is defined as any value that is **not** a ``dict`` (lists / primitives / ``None``
1546
+ are all treated as leaves). Existing columns are never overwritten (idempotent behavior).
1547
+
1548
+ Example
1549
+ If a column ``item`` contains objects like ``{"a": {"b": 1, "c": 2}}`` a pair of new
1550
+ columns ``item.a.b`` and ``item.a.c`` will be added with the corresponding scalar values.
1551
+
1552
+ :param df: Input DataFrame to flatten in place.
1553
+ :type df: ~pandas.DataFrame
1554
+ :param root_prefixes: Optional iterable restricting which top-level columns are considered
1555
+ for flattening. If ``None``, all columns containing at least one ``dict`` value are processed.
1556
+ :type root_prefixes: Optional[Iterable[str]]
1557
+ :return: The same DataFrame instance (returned for convenient chaining).
1558
+ :rtype: ~pandas.DataFrame
1559
+ """
1560
+ candidate_cols = []
1561
+ if root_prefixes is not None:
1562
+ candidate_cols = [c for c in root_prefixes if c in df.columns]
624
1563
  else:
625
- batch_run_client = CodeClient()
626
- data = input_data_df
627
-
628
- with BatchRunContext(batch_run_client):
629
- for evaluator_name, evaluator in evaluators.items():
630
- evaluators_info[evaluator_name] = {}
631
- evaluators_info[evaluator_name]["run"] = batch_run_client.run(
632
- flow=evaluator,
633
- run=target_run,
634
- evaluator_name=evaluator_name,
635
- column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
636
- data=data,
637
- stream=True,
638
- name=kwargs.get("_run_name"),
639
- )
1564
+ # pick columns where at least one non-null value is a dict
1565
+ for c in df.columns:
1566
+ series = df[c]
1567
+ if series.map(lambda v: isinstance(v, dict)).any():
1568
+ candidate_cols.append(c)
1569
+
1570
+ def _extract_leaves(obj: Any, prefix: str) -> Iterator[Tuple[str, Any]]:
1571
+ if isinstance(obj, dict):
1572
+ for k, v in obj.items():
1573
+ new_prefix = f"{prefix}.{k}" if prefix else k
1574
+ if isinstance(v, dict):
1575
+ yield from _extract_leaves(v, new_prefix)
1576
+ else:
1577
+ # treat list / primitive / None as leaf
1578
+ yield new_prefix, v
1579
+
1580
+ for root_col in candidate_cols:
1581
+ # Build a union of leaf paths across rows to ensure consistent columns
1582
+ leaf_paths: Set[str] = set()
1583
+ for val in df[root_col]:
1584
+ if isinstance(val, dict):
1585
+ for path, _ in _extract_leaves(val, root_col):
1586
+ leaf_paths.add(path)
1587
+
1588
+ if not leaf_paths:
1589
+ continue
1590
+
1591
+ # Create each flattened column if absent
1592
+ for path in leaf_paths:
1593
+ if path in df.columns:
1594
+ continue # already present
1595
+ relative_keys = path[len(root_col) + 1 :].split(".") if len(path) > len(root_col) else []
1596
+
1597
+ def getter(root_val: Any) -> Any:
1598
+ cur = root_val
1599
+ for rk in relative_keys:
1600
+ if not isinstance(cur, dict):
1601
+ return None
1602
+ cur = cur.get(rk, None)
1603
+ return cur
1604
+
1605
+ df[path] = df[root_col].map(lambda rv: getter(rv) if isinstance(rv, dict) else None)
1606
+
1607
+ return df
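
A short, hedged usage sketch of the helper above (input values invented); it assumes the function defined here is available in the current scope:

```python
import pandas as pd

df = pd.DataFrame({"item": [{"a": {"b": 1, "c": 2}}, {"a": {"b": 3}}]})
df = _flatten_object_columns_for_default_mapping(df)
print(sorted(c for c in df.columns if "." in c))  # ['item.a.b', 'item.a.c']
# Rows missing a leaf get None, so default mappings like ${data.item.a.c} stay aligned across rows.
```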
1608
+
640
1609
 
641
- # get_details needs to be called within BatchRunContext scope in order to have user agent populated
642
- for evaluator_name, evaluator_info in evaluators_info.items():
643
- evaluator_info["result"] = batch_run_client.get_details(evaluator_info["run"], all_results=True)
644
- evaluator_info["metrics"] = batch_run_client.get_metrics(evaluator_info["run"])
1610
+ def _run_callable_evaluators(
1611
+ validated_data: __ValidatedData,
1612
+ fail_on_evaluator_errors: bool = False,
1613
+ **kwargs,
1614
+ ) -> Tuple[pd.DataFrame, Dict[str, Any], Dict[str, __EvaluatorInfo]]:
1615
+
1616
+ # Extract needed values
1617
+ batch_run_client = validated_data["batch_run_client"]
1618
+ target_run = validated_data["target_run"]
1619
+ batch_run_data = validated_data["batch_run_data"]
1620
+ column_mapping = validated_data["column_mapping"]
1621
+ evaluators = validated_data["evaluators"]
1622
+
1623
+ # Clean up temporary file after evaluation if it was created
1624
+ temp_file_to_cleanup = None
1625
+ if (
1626
+ isinstance(batch_run_client, ProxyClient)
1627
+ and isinstance(batch_run_data, str)
1628
+ and batch_run_data.endswith(".jsonl")
1629
+ ):
1630
+ # Check if it's a temporary file (contains temp directory path)
1631
+ if tempfile.gettempdir() in batch_run_data:
1632
+ temp_file_to_cleanup = batch_run_data
645
1633
 
1634
+ try:
1635
+ with EvalRunContext(batch_run_client):
1636
+ runs = {
1637
+ evaluator_name: batch_run_client.run(
1638
+ flow=evaluator,
1639
+ data=batch_run_data,
1640
+ # Don't pass target_run when using complete dataframe
1641
+ run=target_run,
1642
+ evaluator_name=evaluator_name,
1643
+ column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
1644
+ stream=True,
1645
+ name=kwargs.get("_run_name"),
1646
+ )
1647
+ for evaluator_name, evaluator in evaluators.items()
1648
+ }
1649
+
1650
+ # get_details needs to be called within EvalRunContext scope in order to have user agent populated
1651
+ per_evaluator_results: Dict[str, __EvaluatorInfo] = {
1652
+ evaluator_name: {
1653
+ "result": batch_run_client.get_details(run, all_results=True),
1654
+ "metrics": batch_run_client.get_metrics(run),
1655
+ "run_summary": batch_run_client.get_run_summary(run),
1656
+ }
1657
+ for evaluator_name, run in runs.items()
1658
+ }
1659
+ finally:
1660
+ # Clean up temporary file if it was created
1661
+ if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
1662
+ try:
1663
+ os.unlink(temp_file_to_cleanup)
1664
+ except Exception as e:
1665
+ LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
646
1666
  # Concatenate all results
647
- evaluators_result_df = None
1667
+ evaluators_result_df = pd.DataFrame()
648
1668
  evaluators_metric = {}
649
- for evaluator_name, evaluator_info in evaluators_info.items():
650
- evaluator_result_df = evaluator_info["result"]
1669
+ for evaluator_name, evaluator_result in per_evaluator_results.items():
1670
+ if fail_on_evaluator_errors and evaluator_result["run_summary"]["failed_lines"] > 0:
1671
+ _print_summary(per_evaluator_results)
1672
+ _turn_error_logs_into_exception(evaluator_result["run_summary"]["log_path"] + "/error.json")
1673
+
1674
+ evaluator_result_df = evaluator_result["result"]
651
1675
 
652
1676
  # drop input columns
653
1677
  evaluator_result_df = evaluator_result_df.drop(
@@ -670,27 +1694,826 @@ def _evaluate( # pylint: disable=too-many-locals
670
1694
  else evaluator_result_df
671
1695
  )
672
1696
 
673
- evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_info["metrics"].items()})
1697
+ evaluators_metric.update({f"{evaluator_name}.{k}": v for k, v in evaluator_result["metrics"].items()})
674
1698
 
675
1699
  # Rename columns, generated by target function to outputs instead of inputs.
676
1700
  # If target generates columns, already present in the input data, these columns
677
1701
  # will be marked as outputs already so we do not need to rename them.
678
- input_data_df = _rename_columns_conditionally(input_data_df)
679
-
680
- result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
681
- metrics = _aggregate_metrics(evaluators_result_df, evaluators)
682
- metrics.update(evaluators_metric)
683
- studio_url = _log_metrics_and_instance_results(
684
- metrics,
685
- result_df,
686
- trace_destination,
687
- target_run,
688
- evaluation_name,
1702
+
1703
+ input_data_df = _rename_columns_conditionally(validated_data["input_data_df"])
1704
+ eval_result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
1705
+ eval_metrics = _aggregate_metrics(evaluators_result_df, evaluators)
1706
+ eval_metrics.update(evaluators_metric)
1707
+
1708
+ return eval_result_df, eval_metrics, per_evaluator_results
1709
+
1710
+
1711
+ def _map_names_to_builtins(
1712
+ evaluators: Dict[str, Callable],
1713
+ graders: Dict[str, AzureOpenAIGrader],
1714
+ ) -> Dict[str, str]:
1715
+ """
1716
+ Construct a mapping from user-supplied evaluator names to which known, built-in
1717
+ evaluator or grader they refer to. Custom evaluators are excluded from the mapping
1718
+ as we only want to track built-in evaluators and graders.
1719
+
1720
+ :param evaluators: The dictionary of evaluators.
1721
+ :type evaluators: Dict[str, Callable]
1722
+ :param graders: The dictionary of graders.
1723
+ :type graders: Dict[str, AzureOpenAIGrader]
1724
+ :return: A mapping from user-supplied evaluator and grader names to built-in evaluator or grader identifiers.
1725
+ :rtype: Dict[str, str]
1726
+
1727
+ """
1728
+ from .._eval_mapping import EVAL_CLASS_MAP
1729
+
1730
+ name_map = {}
1731
+
1732
+ for name, evaluator in evaluators.items():
1733
+ # Check if the evaluator is a known built-in evaluator
1734
+ found_eval = False
1735
+ for eval_class, eval_id in EVAL_CLASS_MAP.items():
1736
+ if isinstance(evaluator, eval_class):
1737
+ name_map[name] = eval_id
1738
+ found_eval = True
1739
+ break
1740
+ if not found_eval:
1741
+ # Skip custom evaluators - we only want to track built-in evaluators
1742
+ pass
1743
+
1744
+ for name, grader in graders.items():
1745
+ name_map[name] = grader.id
1746
+
1747
+ return name_map
1748
+
1749
+
1750
+ def _turn_error_logs_into_exception(log_path: str) -> None:
1751
+ """Produce an EvaluationException using the contents of the inputted
1752
+ file as the error message.
1753
+
1754
+ :param log_path: The path to the error log file.
1755
+ :type log_path: str
1756
+ """
1757
+ with open(log_path, "r", encoding=DefaultOpenEncoding.READ) as file:
1758
+ error_message = file.read()
1759
+ raise EvaluationException(
1760
+ message=error_message,
1761
+ target=ErrorTarget.EVALUATE,
1762
+ category=ErrorCategory.FAILED_EXECUTION,
1763
+ blame=ErrorBlame.UNKNOWN,
689
1764
  )
690
1765
 
691
- result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url}
692
1766
 
693
- if output_path:
694
- _write_output(output_path, result)
1767
+ def _convert_results_to_aoai_evaluation_results(
1768
+ results: EvaluationResult,
1769
+ logger: logging.Logger,
1770
+ eval_id: Optional[str] = None,
1771
+ eval_run_id: Optional[str] = None,
1772
+ evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]] = None,
1773
+ eval_run_summary: Optional[Dict[str, Any]] = None,
1774
+ eval_meta_data: Optional[Dict[str, Any]] = None,
1775
+ ) -> None:
1776
+ """
1777
+ Convert evaluation results to AOAI evaluation results format.
1778
+
1779
+ Each row of input results.rows looks like:
1780
+ {"inputs.query":"What is the capital of France?","inputs.context":"France is in Europe",
1781
+ "inputs.generated_response":"Paris is the capital of France.","inputs.ground_truth":"Paris is the capital of France.",
1782
+ "outputs.F1_score.f1_score":1.0,"outputs.F1_score.f1_result":"pass","outputs.F1_score.f1_threshold":0.5}
1783
+
1784
+ Convert each row into new RunOutputItem object with results array.
1785
+
1786
+ :param results: The evaluation results to convert
1787
+ :type results: EvaluationResult
1788
+ :param eval_meta_data: The evaluation metadata, containing eval_id, eval_run_id, and testing_criteria
1789
+ :type eval_meta_data: Dict[str, Any]
1790
+ :param logger: Logger instance
1791
+ :type logger: logging.Logger
1792
+ :return: EvaluationResult with converted evaluation results in AOAI format
1793
+ :rtype: EvaluationResult
1794
+ """
695
1795
 
696
- return result
1796
+ if evaluators is None:
1797
+ return
1798
+
1799
+ # Get the testing_criteria_name and testing_criteria_type from evaluators
1800
+ testing_criteria_name_types_metrics: Optional[Dict[str, Any]] = {}
1801
+ criteria_name_types_from_meta: Optional[Dict[str, str]] = {}
1802
+ if eval_meta_data and "testing_criteria" in eval_meta_data:
1803
+ testing_criteria_list: Optional[List[Dict[str, Any]]] = eval_meta_data.get("testing_criteria")
1804
+ if testing_criteria_list is not None:
1805
+ for criteria in testing_criteria_list:
1806
+ criteria_name = criteria.get("name")
1807
+ criteria_type = criteria.get("type")
1808
+ if criteria_name is not None and criteria_type is not None:
1809
+ criteria_name_types_from_meta[criteria_name] = criteria
1810
+
1811
+ for criteria_name, evaluator in evaluators.items():
1812
+ criteria_type = None
1813
+ metrics = []
1814
+ if criteria_name in criteria_name_types_from_meta:
1815
+ criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
1816
+ evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
1817
+ current_evaluator_metrics = criteria_name_types_from_meta[criteria_name].get("metrics", None)
1818
+ if current_evaluator_metrics and len(current_evaluator_metrics) > 0:
1819
+ metrics.extend(current_evaluator_metrics)
1820
+ elif evaluator_name:
1821
+ if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
1822
+ evaluator_name = evaluator_name.replace("builtin.", "")
1823
+ metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
1824
+ if metrics_mapped and len(metrics_mapped) > 0:
1825
+ metrics.extend(metrics_mapped)
1826
+ else:
1827
+ metrics.append(criteria_name)
1828
+ else:
1829
+ metrics.append(criteria_name)
1830
+ elif isinstance(evaluator, AzureOpenAIGrader):
1831
+ criteria_type = evaluator._type # pylint: disable=protected-access
1832
+ metrics.append(criteria_name)
1833
+ elif isinstance(evaluator, EvaluatorBase):
1834
+ criteria_type = "azure_ai_evaluator"
1835
+ evaluator_class_name = evaluator.__class__.__name__
1836
+ eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name, None)
1837
+ if eval_name:
1838
+ metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])
1839
+ if metrics_mapped and len(metrics_mapped) > 0:
1840
+ metrics.extend(metrics_mapped)
1841
+ else:
1842
+ metrics.append(criteria_name)
1843
+ else:
1844
+ criteria_type = "unknown"
1845
+ metrics.append(criteria_name)
1846
+ testing_criteria_name_types_metrics[criteria_name] = {"type": criteria_type, "metrics": metrics}
1847
+
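As a rough sketch, after this loop the lookup table could look like the following for a run with one built-in evaluator and one AOAI grader. The names and the grader type string are illustrative assumptions, not values taken from the package.

    # Illustrative shape of testing_criteria_name_types_metrics after resolution:
    # {
    #     "F1_score": {"type": "azure_ai_evaluator", "metrics": ["f1_score"]},
    #     "my_label_grader": {"type": "label_model", "metrics": ["my_label_grader"]},  # grader _type varies by class
    # }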
1848
+ created_time = int(time.time())
1849
+ converted_rows = []
1850
+
1851
+ for row_idx, row in enumerate(results.get("rows", [])):
1852
+ # Group outputs by test criteria name
1853
+ criteria_groups = {criteria: {} for criteria in testing_criteria_name_types_metrics.keys()}
1854
+ input_groups = {}
1855
+ top_sample = {}
1856
+ for key, value in row.items():
1857
+ if key.startswith("outputs."):
1858
+ # Parse key: outputs.<test-criteria-name>.<metric>
1859
+ parts = key.split(".", 2) # Split into max 3 parts: ['outputs', '<criteria-name>', '<metric>']
1860
+ if len(parts) >= 3:
1861
+ criteria_name = parts[1]
1862
+ metric_name = parts[2]
1863
+
1864
+ if criteria_name not in criteria_groups:
1865
+ criteria_groups[criteria_name] = {}
1866
+
1867
+ criteria_groups[criteria_name][metric_name] = value
1868
+ elif key.startswith("inputs."):
1869
+ input_key = key.replace("inputs.", "")
1870
+ if input_key not in input_groups:
1871
+ input_groups[input_key] = value
1872
+
1873
+ # Convert each criteria group to RunOutputItem result
1874
+ run_output_results = []
1875
+ for criteria_name, metrics in criteria_groups.items():
1876
+ # Extract metrics for this criteria
1877
+ expected_metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
1878
+ criteria_type = testing_criteria_name_types_metrics.get(criteria_name, {}).get("type", "unknown")
1879
+ result_per_metric = {}
1880
+ # Find score - look for various score patterns
1881
+ for metric_key, metric_value in metrics.items():
1882
+ if metric_key.endswith("_score") or metric_key == "score":
1883
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1884
+ if metric not in result_per_metric:
1885
+ result_per_metric[metric] = {"score": metric_value}
1886
+ else:
1887
+ result_per_metric[metric]["score"] = metric_value
1888
+ _append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
1889
+ if metric_key == "passed":
1890
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1891
+ if metric not in result_per_metric:
1892
+ result_per_metric[metric] = {"passed": metric_value}
1893
+ else:
1894
+ result_per_metric[metric]["passed"] = metric_value
1895
+ _append_indirect_attachments_to_results(result_per_metric, "passed", metric, metric_value)
1896
+ elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
1897
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1898
+ label = metric_value
1899
+ passed = str(metric_value).lower() in ("pass", "true")
1902
+ if metric not in result_per_metric:
1903
+ if criteria_type == "azure_ai_evaluator":
1904
+ result_per_metric[metric] = {"label": label, "passed": passed}
1905
+ else:
1906
+ result_per_metric[metric] = {"label": label}
1907
+ else:
1908
+ result_per_metric[metric]["label"] = metric_value
1909
+ if criteria_type == "azure_ai_evaluator":
1910
+ result_per_metric[metric]["passed"] = passed
1911
+ _append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
1912
+ if criteria_type == "azure_ai_evaluator":
1913
+ _append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
1914
+ elif (
1915
+ metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
1916
+ ) or metric_key == "reason":
1917
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1918
+ if metric not in result_per_metric:
1919
+ result_per_metric[metric] = {"reason": metric_value}
1920
+ else:
1921
+ result_per_metric[metric]["reason"] = metric_value
1922
+ _append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
1923
+ elif metric_key.endswith("_threshold") or metric_key == "threshold":
1924
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1925
+ if metric not in result_per_metric:
1926
+ result_per_metric[metric] = {"threshold": metric_value}
1927
+ else:
1928
+ result_per_metric[metric]["threshold"] = metric_value
1929
+ _append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
1930
+ elif metric_key == "sample":
1931
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1932
+ if metric not in result_per_metric:
1933
+ result_per_metric[metric] = {"sample": metric_value}
1934
+ else:
1935
+ result_per_metric[metric]["sample"] = metric_value
1936
+ _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
1937
+ elif metric_key.endswith("_finish_reason"):
1938
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1939
+ if metric not in result_per_metric:
1940
+ result_per_metric[metric] = {"sample": {"finish_reason": metric_value}}
1941
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1942
+ result_per_metric[metric]["sample"] = {"finish_reason": metric_value}
1943
+ elif (
1944
+ metric in result_per_metric
1945
+ and "sample" in result_per_metric[metric]
1946
+ and "finish_reason" not in result_per_metric[metric]["sample"]
1947
+ ):
1948
+ result_per_metric[metric]["sample"]["finish_reason"] = metric_value
1949
+ _append_indirect_attachments_to_results(
1950
+ result_per_metric, "sample", metric, metric_value, "finish_reason"
1951
+ )
1952
+ elif metric_key.endswith("_model"):
1953
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1954
+ if metric not in result_per_metric:
1955
+ result_per_metric[metric] = {"sample": {"model": metric_value}}
1956
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1957
+ result_per_metric[metric]["sample"] = {"model": metric_value}
1958
+ elif (
1959
+ metric in result_per_metric
1960
+ and "sample" in result_per_metric[metric]
1961
+ and "model" not in result_per_metric[metric]["sample"]
1962
+ ):
1963
+ result_per_metric[metric]["sample"]["model"] = metric_value
1964
+ _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
1965
+ elif metric_key.endswith("_sample_input"):
1966
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1967
+ input_metric_val_json: Optional[List[Dict[str, Any]]] = []
1968
+ try:
1969
+ input_metric_val_json = json.loads(metric_value)
1970
+ except Exception as e:
1971
+ logger.warning(f"Failed to parse _sample_input value as JSON: {e}")
1972
+ if metric not in result_per_metric:
1973
+ result_per_metric[metric] = {"sample": {"input": input_metric_val_json}}
1974
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1975
+ result_per_metric[metric]["sample"] = {"input": input_metric_val_json}
1976
+ elif (
1977
+ metric in result_per_metric
1978
+ and "sample" in result_per_metric[metric]
1979
+ and "input" not in result_per_metric[metric]["sample"]
1980
+ ):
1981
+ result_per_metric[metric]["sample"]["input"] = input_metric_val_json
1982
+ _append_indirect_attachments_to_results(
1983
+ result_per_metric, "sample", metric, input_metric_val_json, "input"
1984
+ )
1985
+ elif metric_key.endswith("_sample_output"):
1986
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
1987
+ output_metric_val_json: Optional[List[Dict[str, Any]]] = []
1988
+ try:
1989
+ output_metric_val_json = json.loads(metric_value)
1990
+ except Exception as e:
1991
+ logger.warning(f"Failed to parse _sample_output value as JSON: {e}")
1992
+ if metric not in result_per_metric:
1993
+ result_per_metric[metric] = {"sample": {"output": output_metric_val_json}}
1994
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
1995
+ result_per_metric[metric]["sample"] = {"output": output_metric_val_json}
1996
+ elif (
1997
+ metric in result_per_metric
1998
+ and "sample" in result_per_metric[metric]
1999
+ and "output" not in result_per_metric[metric]["sample"]
2000
+ ):
2001
+ result_per_metric[metric]["sample"]["output"] = output_metric_val_json
2002
+ _append_indirect_attachments_to_results(
2003
+ result_per_metric, "sample", metric, output_metric_val_json, "output"
2004
+ )
2005
+ elif metric_key.endswith("_total_tokens"):
2006
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2007
+ metric_value = None if _is_none_or_nan(metric_value) else metric_value
2008
+ if metric not in result_per_metric:
2009
+ result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
2010
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2011
+ result_per_metric[metric]["sample"] = {"usage": {"total_tokens": metric_value}}
2012
+ elif (
2013
+ metric in result_per_metric
2014
+ and "sample" in result_per_metric[metric]
2015
+ and "usage" not in result_per_metric[metric]["sample"]
2016
+ ):
2017
+ result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
2018
+ else:
2019
+ result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
2020
+ _append_indirect_attachments_to_results(
2021
+ result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
2022
+ )
2023
+ elif metric_key.endswith("_prompt_tokens"):
2024
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2025
+ metric_value = None if _is_none_or_nan(metric_value) else metric_value
2026
+ if metric not in result_per_metric:
2027
+ result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
2028
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2029
+ result_per_metric[metric]["sample"] = {"usage": {"prompt_tokens": metric_value}}
2030
+ elif (
2031
+ metric in result_per_metric
2032
+ and "sample" in result_per_metric[metric]
2033
+ and "usage" not in result_per_metric[metric]["sample"]
2034
+ ):
2035
+ result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
2036
+ else:
2037
+ result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
2038
+ _append_indirect_attachments_to_results(
2039
+ result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
2040
+ )
2041
+ elif metric_key.endswith("_completion_tokens"):
2042
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2043
+ metric_value = None if _is_none_or_nan(metric_value) else metric_value
2044
+ if metric not in result_per_metric:
2045
+ result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
2046
+ elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
2047
+ result_per_metric[metric]["sample"] = {"usage": {"completion_tokens": metric_value}}
2048
+ elif (
2049
+ metric in result_per_metric
2050
+ and "sample" in result_per_metric[metric]
2051
+ and "usage" not in result_per_metric[metric]["sample"]
2052
+ ):
2053
+ result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
2054
+ else:
2055
+ result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
2056
+ _append_indirect_attachments_to_results(
2057
+ result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
2058
+ )
2059
+ elif not any(
2060
+ metric_key.endswith(suffix)
2061
+ for suffix in [
2062
+ "_result",
2063
+ "_reason",
2064
+ "_threshold",
2065
+ "_label",
2066
+ "_score",
2067
+ "_model",
2068
+ "_finish_reason",
2069
+ "_sample_input",
2070
+ "_sample_output",
2071
+ "_total_tokens",
2072
+ "_prompt_tokens",
2073
+ "_completion_tokens",
2074
+ ]
2075
+ ):
2076
+ metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
2077
+ # If no score found yet and this doesn't match other patterns, use as score
2078
+ if metric_key == metric and metric not in result_per_metric:
2079
+ result_per_metric[metric] = {"score": metric_value}
2080
+ elif metric_key == metric and result_per_metric[metric].get("score", None) is None:
2081
+ result_per_metric[metric]["score"] = metric_value
2082
+
2083
+ for metric, metric_values in result_per_metric.items():
2084
+ score = metric_values.get("score", None)
2085
+ label = metric_values.get("label", None)
2086
+ reason = metric_values.get("reason", None)
2087
+ threshold = metric_values.get("threshold", None)
2088
+ passed = metric_values.get("passed", None)
2089
+ sample = metric_values.get("sample", None)
2090
+
2091
+ # Create result object for this criteria
2092
+ result_obj = {
2093
+ "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
2094
+ "type", "azure_ai_evaluator"
2095
+ ),
2096
+ "name": criteria_name, # Use criteria name as name
2097
+ "metric": metric if metric is not None else criteria_name, # Use criteria name as metric
2098
+ }
2099
+ # Add optional fields
2100
+ if (
2101
+ metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
2102
+ or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
2103
+ or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
2104
+ ):
2105
+ copy_label = label
2106
+ if isinstance(copy_label, bool) and copy_label:
2107
+ label = "fail"
2108
+ score = 0.0
2109
+ passed = False
2110
+ else:
2111
+ label = "pass"
2112
+ score = 1.0
2113
+ passed = True
2114
+ result_obj["score"] = (
2115
+ score if not (score is None or (isinstance(score, float) and math.isnan(score))) else None
2116
+ )
2117
+ result_obj["label"] = label
2118
+ result_obj["reason"] = reason
2119
+ result_obj["threshold"] = threshold
2120
+ result_obj["passed"] = passed
2121
+
2122
+ if sample is not None:
2123
+ result_obj["sample"] = sample
2124
+ top_sample = sample # Save top sample for the row
2125
+ run_output_results.append(result_obj)
2126
+
2127
+ if (
2128
+ eval_run_summary
2129
+ and criteria_name in eval_run_summary
2130
+ and isinstance(eval_run_summary[criteria_name], dict)
2131
+ and "error_code" in eval_run_summary[criteria_name]
2132
+ ) and eval_run_summary[criteria_name].get("error_code", None) is not None:
2133
+ # error_code is guaranteed non-None in this branch, so attach the error details directly
+ error_info = {
+ "code": eval_run_summary[criteria_name].get("error_code", None),
+ "message": eval_run_summary[criteria_name].get("error_message", None),
+ }
+ sample = {"error": error_info}
2142
+ # Create result object for this criteria
2143
+ metrics = testing_criteria_name_types_metrics.get(criteria_name, {}).get("metrics", [])
2144
+ for metric in metrics:
2145
+ should_add_error_summary = True
2146
+ for result in run_output_results:
2147
+ if result.get("name", None) == criteria_name and result.get("metric", None) == metric:
2148
+ rs_score = result.get("score", None)
2149
+ rs_threshold = result.get("threshold", None)
2150
+ rs_label = result.get("label", None)
2151
+ rs_reason = result.get("reason", None)
2152
+ if (
2153
+ _is_none_or_nan(rs_score)
2154
+ and _is_none_or_nan(rs_threshold)
2155
+ and _is_none_or_nan(rs_label)
2156
+ and _is_none_or_nan(rs_reason)
2157
+ ):
2158
+ run_output_results.remove(result)
2159
+ else:
2160
+ should_add_error_summary = False
2161
+ break # Skip if already have result for this criteria and metric
2162
+ if should_add_error_summary:
2163
+ result_obj = {
2164
+ "type": testing_criteria_name_types_metrics.get(criteria_name, {}).get(
2165
+ "type", "azure_ai_evaluator"
2166
+ ),
2167
+ "name": criteria_name, # Use criteria name as name
2168
+ "metric": metric if metric is not None else criteria_name, # Use criteria name as metric
2169
+ "score": None,
2170
+ "label": None,
2171
+ "reason": None,
2172
+ "threshold": None,
2173
+ "passed": None,
2174
+ "sample": sample,
2175
+ }
2176
+ run_output_results.append(result_obj)
2177
+
2178
+ # Create RunOutputItem structure
2179
+ run_output_item = {
2180
+ "object": "eval.run.output_item",
2181
+ "id": f"{row_idx+1}",
2182
+ "run_id": eval_run_id,
2183
+ "eval_id": eval_id,
2184
+ "created_at": created_time,
2185
+ "datasource_item_id": row_idx,
2186
+ "datasource_item": input_groups,
2187
+ "results": run_output_results,
2188
+ "status": "completed" if len(run_output_results) > 0 else "error",
2189
+ }
2190
+
2191
+ run_output_item["sample"] = top_sample
2192
+
2193
+ converted_rows.append(run_output_item)
2194
+
2195
+ # Create converted results maintaining the same structure
2196
+ results["_evaluation_results_list"] = converted_rows
2197
+ logger.info(
2198
+ f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
2199
+ )
2200
+ # Calculate summary statistics
2201
+ evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger, criteria_name_types_from_meta)
2202
+ results["_evaluation_summary"] = evaluation_summary
2203
+ logger.info(
2204
+ f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
2205
+ )
2206
+
2207
+
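A minimal usage sketch, assuming `results` is the dict returned by a prior evaluate() call and `evaluators` is the same mapping passed to it; the two keys read at the end are the ones this function attaches.

    import logging

    logger = logging.getLogger(__name__)
    _convert_results_to_aoai_evaluation_results(
        results=results,
        logger=logger,
        eval_id="eval_123",      # placeholder ids
        eval_run_id="run_456",
        evaluators=evaluators,
        eval_run_summary=None,
        eval_meta_data=None,
    )
    converted_rows = results["_evaluation_results_list"]   # list of RunOutputItem dicts
    summary = results["_evaluation_summary"]               # result_counts / per_model_usage / per_testing_criteria_results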
2208
+ def _is_none_or_nan(value: Any) -> bool:
2209
+ """
2210
+ Check if a value is None or NaN.
2211
+
2212
+ :param value: The value to check
2213
+ :type value: Any
2214
+ :return: True if the value is None or NaN, False otherwise
2215
+ :rtype: bool
2216
+ """
2217
+ if value is None:
2218
+ return True
2219
+ if isinstance(value, float) and math.isnan(value):
2220
+ return True
2221
+ if isinstance(value, str) and value.lower() in ["nan", "null", "none", ""]:
2222
+ return True
2223
+ return False
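A few illustrative calls, sketching the intended behavior rather than an exhaustive test:

    assert _is_none_or_nan(None) is True
    assert _is_none_or_nan(float("nan")) is True
    assert _is_none_or_nan("NaN") is True      # case-insensitive string check
    assert _is_none_or_nan("") is True
    assert _is_none_or_nan(0) is False         # 0 is a real value, not missing
    assert _is_none_or_nan("pass") is False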
2224
+
2225
+
2226
+ def _append_indirect_attachments_to_results(
2227
+ current_result_dict: Dict[str, Any],
2228
+ result_name: str,
2229
+ metric: str,
2230
+ metric_value: Any,
2231
+ nested_result_name: Optional[str] = None,
2232
+ secondnested_result_name: Optional[str] = None,
2233
+ ) -> None:
2234
+ """
2235
+ Append indirect attachments to the current result dictionary.
2236
+
2237
+ :param current_result_dict: The current result dictionary to update
2238
+ :type current_result_dict: Dict[str, Any]
2239
+ :param result_name: The result name
2240
+ :type result_name: str
2241
+ :param metric: The metric name
2242
+ :type metric: str
2243
+ :param metric_value: The value of the metric
+ :type metric_value: Any
+ :param nested_result_name: Optional nested key under the result entry (for example "finish_reason" inside "sample")
+ :type nested_result_name: Optional[str]
+ :param secondnested_result_name: Optional second-level nested key (for example "total_tokens" inside "usage")
+ :type secondnested_result_name: Optional[str]
+ """
2246
+ if metric == "xpia" and result_name:
2247
+ for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
2248
+ if nested_result_name is None:
2249
+ if metric_extended not in current_result_dict:
2250
+ current_result_dict[metric_extended] = {result_name: metric_value}
2251
+ else:
2252
+ current_result_dict[metric_extended][result_name] = metric_value
2253
+ elif nested_result_name is not None and secondnested_result_name is None:
2254
+ if metric_extended not in current_result_dict:
2255
+ current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
2256
+ elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
2257
+ current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
2258
+ elif (
2259
+ metric_extended in current_result_dict
2260
+ and result_name in current_result_dict[metric_extended]
2261
+ and nested_result_name not in current_result_dict[metric_extended][result_name]
2262
+ ):
2263
+ current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
2264
+ elif nested_result_name is not None and secondnested_result_name is not None:
2265
+ if metric_extended not in current_result_dict:
2266
+ current_result_dict[metric_extended] = {
2267
+ result_name: {nested_result_name: {secondnested_result_name: metric_value}}
2268
+ }
2269
+ elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
2270
+ current_result_dict[metric_extended][result_name] = {
2271
+ nested_result_name: {secondnested_result_name: metric_value}
2272
+ }
2273
+ elif (
2274
+ metric_extended in current_result_dict
2275
+ and result_name in current_result_dict[metric_extended]
2276
+ and nested_result_name not in current_result_dict[metric_extended][result_name]
2277
+ ):
2278
+ current_result_dict[metric_extended][result_name][nested_result_name] = {
2279
+ secondnested_result_name: metric_value
2280
+ }
2281
+ else:
2282
+ (
2283
+ current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
2284
+ ) = metric_value
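For example, when the composite "xpia" metric reports a score, the same value is mirrored into the three sub-metrics so each gets its own result entry. A sketch of the mutation this helper performs:

    result_per_metric = {"xpia": {"score": 1.0}}
    _append_indirect_attachments_to_results(result_per_metric, "score", "xpia", 1.0)
    # result_per_metric now also contains:
    #   "xpia_manipulated_content": {"score": 1.0}
    #   "xpia_intrusion": {"score": 1.0}
    #   "xpia_information_gathering": {"score": 1.0}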
2285
+
2286
+
2287
+ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
2288
+ """
2289
+ Get the metric name from the testing criteria and metric key.
2290
+
2291
+ :param testing_criteria_name: The name of the testing criteria
2292
+ :type testing_criteria_name: str
2293
+ :param metric_key: The metric key to look for
2294
+ :type metric_key: str
2295
+ :param metric_list: List of expected metrics for the testing criteria
2296
+ :type metric_list: List[str]
2297
+ :return: The metric name if found, otherwise the testing criteria name
2298
+ :rtype: str
2299
+ """
2300
+ metric = None
2301
+
2302
+ if metric_key == "xpia_manipulated_content":
2303
+ metric = "xpia_manipulated_content"
2304
+ return metric
2305
+ elif metric_key == "xpia_intrusion":
2306
+ metric = "xpia_intrusion"
2307
+ return metric
2308
+ elif metric_key == "xpia_information_gathering":
2309
+ metric = "xpia_information_gathering"
2310
+ return metric
2311
+ for expected_metric in metric_list:
2312
+ if metric_key.startswith(expected_metric):
2313
+ metric = expected_metric
2314
+ break
2315
+ if metric is None:
2316
+ metric = testing_criteria_name
2317
+ return metric
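Illustrative resolutions, with criteria names and expected-metric lists assumed for the example:

    # Prefix match against the expected metrics wins...
    _get_metric_from_criteria("violence_eval", "violence_score", ["violence"])      # -> "violence"
    # ...xpia sub-metric keys are passed through as-is...
    _get_metric_from_criteria("indirect_attack", "xpia_intrusion", ["xpia"])        # -> "xpia_intrusion"
    # ...and anything unmatched falls back to the criteria name.
    _get_metric_from_criteria("my_grader", "unexpected_key", ["something_else"])    # -> "my_grader"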
2318
+
2319
+
2320
+ def _is_primary_metric(metric_name: str, evaluator_name: str) -> bool:
2321
+ """
2322
+ Check if the given metric name is a primary metric.
2323
+
2324
+ :param metric_name: The name of the metric
2325
+ :type metric_name: str
2326
+ :param evaluator_name: The name of the evaluator
2327
+ :type evaluator_name: str
2328
+ :return: True if the metric is a primary metric, False otherwise
2329
+ :rtype: bool
2330
+ """
2331
+ if (
2332
+ not _is_none_or_nan(metric_name)
2333
+ and not _is_none_or_nan(evaluator_name)
2334
+ and evaluator_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS
2335
+ and isinstance(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name], list)
2336
+ and len(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]) > 1
2337
+ and metric_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]
2338
+ and metric_name.lower() != _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name][0].lower()
2339
+ ):
2340
+ return False
2341
+ else:
2342
+ return True
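In effect, only the first metric listed for a multi-metric evaluator counts toward pass/fail totals. For example, assuming the mapping for "indirect_attack" lists "xpia" first followed by the per-category sub-metrics (the real lists live in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS):

    _is_primary_metric("xpia", "indirect_attack")                       # -> True  (first listed metric)
    _is_primary_metric("xpia_manipulated_content", "indirect_attack")   # -> False (secondary metric)
    _is_primary_metric("anything", "unknown_evaluator")                 # -> True  (no mapping, treated as primary)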
2343
+
2344
+
2345
+ def _calculate_aoai_evaluation_summary(
2346
+ aoai_results: list, logger: logging.Logger, criteria_name_types_from_meta: Optional[Dict[str, Any]]
2347
+ ) -> Dict[str, Any]:
2348
+ """
2349
+ Calculate summary statistics for AOAI evaluation results.
2350
+
2351
+ :param aoai_results: List of AOAI result objects (run_output_items)
+ :type aoai_results: list
+ :param logger: Logger instance
+ :type logger: logging.Logger
+ :param criteria_name_types_from_meta: Testing criteria definitions keyed by criteria name, if available
+ :type criteria_name_types_from_meta: Optional[Dict[str, Any]]
+ :return: Summary statistics dictionary with result_counts, per_model_usage, and per_testing_criteria_results
+ :rtype: Dict[str, Any]
2355
+ """
2356
+ # Calculate result counts based on aoaiResults
2357
+ result_counts = {"total": 0, "errored": 0, "failed": 0, "passed": 0}
2358
+
2359
+ # Count results by status and calculate per model usage
2360
+ model_usage_stats = {} # Dictionary to aggregate usage by model
2361
+ result_counts_stats = {}  # Dictionary to aggregate pass/fail counts per testing criteria
2362
+
2363
+ for aoai_result in aoai_results:
2364
+ logger.info(
2365
+ f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, row keys: {aoai_result.keys() if hasattr(aoai_result, 'keys') else 'N/A'}"
2366
+ )
2367
+ result_counts["total"] += 1
2368
+ passed_count = 0
2369
+ failed_count = 0
2370
+ error_count = 0
2371
+ if isinstance(aoai_result, dict) and "results" in aoai_result:
2372
+ logger.info(
2373
+ f"Processing aoai_result with id: {getattr(aoai_result, 'id', 'unknown')}, results count: {len(aoai_result['results'])}"
2374
+ )
2375
+ for result_item in aoai_result["results"]:
2376
+ if isinstance(result_item, dict):
2377
+ testing_criteria = result_item.get("name", "")
2378
+ is_primary_metric = True
2379
+ if (
2380
+ criteria_name_types_from_meta is not None
2381
+ and isinstance(criteria_name_types_from_meta, dict)
2382
+ and testing_criteria in criteria_name_types_from_meta
2383
+ ):
2384
+ evaluator_name = criteria_name_types_from_meta[testing_criteria].get("evaluator_name", None)
2385
+ criteria_type = criteria_name_types_from_meta[testing_criteria].get("type", None)
2386
+ if criteria_type == "azure_ai_evaluator" and evaluator_name and evaluator_name.startswith("builtin."):
2387
+ evaluator_name = evaluator_name.replace("builtin.", "")
2388
+ is_primary_metric = _is_primary_metric(result_item.get("metric", ""), evaluator_name)
2389
+ if not is_primary_metric:
2390
+ logger.info(
2391
+ f"Skip counts for non-primary metric for testing_criteria: {testing_criteria}, metric: {result_item.get('metric', '')}"
2392
+ )
2393
+ continue
2394
+ # Check if the result has a 'passed' field
2395
+ if "passed" in result_item and result_item["passed"] is not None:
2396
+ if testing_criteria not in result_counts_stats:
2397
+ result_counts_stats[testing_criteria] = {
2398
+ "testing_criteria": testing_criteria,
2399
+ "failed": 0,
2400
+ "passed": 0,
2401
+ }
2402
+ if result_item["passed"] is True:
2403
+ passed_count += 1
2404
+ result_counts_stats[testing_criteria]["passed"] += 1
2405
+
2406
+ elif result_item["passed"] is False:
2407
+ failed_count += 1
2408
+ result_counts_stats[testing_criteria]["failed"] += 1
2409
+ # Check if the result indicates an error status
2410
+ elif ("status" in result_item and result_item["status"] in ["error", "errored"]) or (
2411
+ "sample" in result_item
2412
+ and isinstance(result_item["sample"], dict)
2413
+ and result_item["sample"].get("error", None) is not None
2414
+ ):
2415
+ error_count += 1
2416
+ elif hasattr(aoai_result, "status") and aoai_result.status == "error":
2417
+ error_count += 1
2418
+ elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
2419
+ error_count += 1
2420
+
2421
+ # Update overall result counts, error counts will not be considered for passed/failed
2422
+ if error_count > 0:
2423
+ result_counts["errored"] += 1
2424
+
2425
+ if failed_count > 0:
2426
+ result_counts["failed"] += 1
2427
+ elif (
2428
+ failed_count == 0 and passed_count > 0 and passed_count == len(aoai_result.get("results", [])) - error_count
2429
+ ):
2430
+ result_counts["passed"] += 1
2431
+
2432
+ # Extract usage statistics from aoai_result.sample
2433
+ sample_data_list = []
2434
+ dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
2435
+ dup_usage_list.remove("xpia")
2436
+ if isinstance(aoai_result, dict) and aoai_result.get("results") and isinstance(aoai_result["results"], list):
2437
+ for result_item in aoai_result["results"]:
2438
+ if (
2439
+ isinstance(result_item, dict)
2440
+ and "sample" in result_item
2441
+ and result_item["sample"]
2442
+ and result_item["metric"] not in dup_usage_list
2443
+ ):
2444
+ sample_data_list.append(result_item["sample"])
2445
+
2446
+ for sample_data in sample_data_list:
2447
+ if sample_data and isinstance(sample_data, dict) and "usage" in sample_data:
2448
+ usage_data = sample_data["usage"]
2449
+ model_name = sample_data.get("model", "unknown") or "unknown"
2450
+ if _is_none_or_nan(model_name):
2451
+ continue
2452
+ if model_name not in model_usage_stats:
2453
+ model_usage_stats[model_name] = {
2454
+ "invocation_count": 0,
2455
+ "total_tokens": 0,
2456
+ "prompt_tokens": 0,
2457
+ "completion_tokens": 0,
2458
+ "cached_tokens": 0,
2459
+ }
2460
+ # Aggregate usage statistics
2461
+ model_stats = model_usage_stats[model_name]
2462
+ model_stats["invocation_count"] += 1
2463
+ if isinstance(usage_data, dict):
2464
+ cur_total_tokens = usage_data.get("total_tokens", 0)
2465
+ if _is_none_or_nan(cur_total_tokens):
2466
+ cur_total_tokens = 0
2467
+ cur_prompt_tokens = usage_data.get("prompt_tokens", 0)
2468
+ if _is_none_or_nan(cur_prompt_tokens):
2469
+ cur_prompt_tokens = 0
2470
+ cur_completion_tokens = usage_data.get("completion_tokens", 0)
2471
+ if _is_none_or_nan(cur_completion_tokens):
2472
+ cur_completion_tokens = 0
2473
+ cur_cached_tokens = usage_data.get("cached_tokens", 0)
2474
+ if _is_none_or_nan(cur_cached_tokens):
2475
+ cur_cached_tokens = 0
2476
+ logger.info(
2477
+ f"Model: {model_name}, cur_total_tokens: {cur_total_tokens}, {_is_none_or_nan(cur_total_tokens)}, cur_prompt_tokens: {cur_prompt_tokens}, cur_completion_tokens: {cur_completion_tokens}, cur_cached_tokens: {cur_cached_tokens}"
2478
+ )
2479
+ model_stats["total_tokens"] += cur_total_tokens
2480
+ model_stats["prompt_tokens"] += cur_prompt_tokens
2481
+ model_stats["completion_tokens"] += cur_completion_tokens
2482
+ model_stats["cached_tokens"] += cur_cached_tokens
2483
+
2484
+ # Convert model usage stats to list format matching EvaluationRunPerModelUsage
2485
+ per_model_usage = []
2486
+ for model_name, stats in model_usage_stats.items():
2487
+ per_model_usage.append(
2488
+ {
2489
+ "model_name": model_name,
2490
+ "invocation_count": stats["invocation_count"],
2491
+ "total_tokens": stats["total_tokens"],
2492
+ "prompt_tokens": stats["prompt_tokens"],
2493
+ "completion_tokens": stats["completion_tokens"],
2494
+ "cached_tokens": stats["cached_tokens"],
2495
+ }
2496
+ )
2497
+ result_counts_stats_val = []
2498
+ logger.info(f"\r\n Result counts stats: {result_counts_stats}")
2499
+ for criteria_name, stats_val in result_counts_stats.items():
2500
+ if isinstance(stats_val, dict):
2501
+ logger.info(f"\r\n Criteria: {criteria_name}, stats: {stats_val}")
2502
+ cur_passed = stats_val.get("passed", 0)
2503
+ if _is_none_or_nan(cur_passed):
2504
+ cur_passed = 0
2505
+ cur_failed_count = stats_val.get("failed", 0)
2506
+ if _is_none_or_nan(cur_failed_count):
2507
+ cur_failed_count = 0
2508
+ result_counts_stats_val.append(
2509
+ {
2510
+ "testing_criteria": criteria_name if not _is_none_or_nan(criteria_name) else "unknown",
2511
+ "passed": cur_passed,
2512
+ "failed": cur_failed_count,
2513
+ }
2514
+ )
2515
+ return {
2516
+ "result_counts": result_counts,
2517
+ "per_model_usage": per_model_usage,
2518
+ "per_testing_criteria_results": result_counts_stats_val,
2519
+ }
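A sketch of the summary this returns for a small run; the counts, model name, and token figures are illustrative only.

    example_summary = {
        "result_counts": {"total": 2, "errored": 0, "failed": 1, "passed": 1},
        "per_model_usage": [
            {
                "model_name": "gpt-4o",          # aggregated per model seen in sample usage
                "invocation_count": 2,
                "total_tokens": 1234,
                "prompt_tokens": 1000,
                "completion_tokens": 234,
                "cached_tokens": 0,
            }
        ],
        "per_testing_criteria_results": [
            {"testing_criteria": "F1_score", "passed": 1, "failed": 1}
        ],
    }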