azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_evaluate_aoai.py (new file)
@@ -0,0 +1,992 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import json
import logging
import re

from openai import AzureOpenAI, OpenAI
import pandas as pd
from typing import Any, Callable, Dict, Tuple, TypeVar, Union, Type, Optional, TypedDict, List, cast, Set
from time import sleep

from ._batch_run import CodeClient, ProxyClient

# import aoai_mapping
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
from azure.ai.evaluation._aoai.aoai_grader import AzureOpenAIGrader
from azure.ai.evaluation._common._experimental import experimental


TClient = TypeVar("TClient", ProxyClient, CodeClient)
LOGGER = logging.getLogger(__name__)

# Precompiled regex for extracting data paths from mapping expressions of the form
# ${data.some.dotted.path}. Compiled once at import time to avoid repeated
# recompilation on each call to _generate_data_source_config.
DATA_PATH_PATTERN = re.compile(r"^\$\{data\.([a-zA-Z0-9_\.]+)\}$")

# Canonical top-level wrapper key expected in nested JSONL evaluation rows.
# Centralizing here avoids magic strings sprinkled through schema/content generation code.
WRAPPER_KEY = "item"


class OAIEvalRunCreationInfo(TypedDict, total=True):
    """Information needed to track and retrieve an AOAI evaluation run."""

    client: Union[AzureOpenAI, OpenAI]
    eval_group_id: str
    eval_run_id: str
    grader_name_map: Dict[str, str]
    # Total number of expected rows in the original dataset. Used to
    # re-align AOAI grader results to guard against silent row drops
    # causing horizontal concatenation misalignment.
    expected_rows: int


def _split_evaluators_and_grader_configs(
    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
    """
    Given a dictionary mapping names to evaluators and AOAI graders, identify which is which, and return two
    dictionaries that each contain one subset, the first containing the evaluators and the second containing
    the AOAI graders. AOAI graders are defined as anything that is an instance of the AzureOpenAIGrader class,
    including child class instances.

    :param evaluators: Evaluators to be used for evaluation. It should be a dictionary with key as alias for evaluator
        and value as the evaluator function or AOAI grader.
    :type evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
    :return: Tuple of two dictionaries, the first containing evaluators and the second containing AOAI graders.
    :rtype: Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]
    """
    LOGGER.info(f"AOAI: Splitting {len(evaluators)} evaluators into AOAI graders and standard evaluators...")
    true_evaluators = {}
    aoai_graders = {}
    for key, value in evaluators.items():
        if isinstance(value, AzureOpenAIGrader):
            aoai_graders[key] = value
        else:
            true_evaluators[key] = value
    LOGGER.info(f"AOAI: Found {len(aoai_graders)} AOAI graders and {len(true_evaluators)} standard evaluators.")
    return true_evaluators, aoai_graders


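For orientation, a minimal standalone sketch of the partitioning rule used above (not part of the packaged file): anything that is an instance of the grader base class goes into the AOAI-grader bucket, everything else is treated as a regular callable evaluator. FakeGrader stands in for AzureOpenAIGrader purely for illustration.

from typing import Callable, Dict, Tuple, Union

class FakeGrader:  # stand-in for AzureOpenAIGrader, for illustration only
    pass

def split(evaluators: Dict[str, Union[Callable, FakeGrader]]) -> Tuple[dict, dict]:
    callables, graders = {}, {}
    for name, value in evaluators.items():
        # Same rule as _split_evaluators_and_grader_configs: isinstance check decides the bucket.
        (graders if isinstance(value, FakeGrader) else callables)[name] = value
    return callables, graders

def exact_match(response: str, ground_truth: str) -> dict:
    return {"exact_match": float(response == ground_truth)}

callables, graders = split({"em": exact_match, "label": FakeGrader()})
assert list(callables) == ["em"] and list(graders) == ["label"]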
@experimental
def _begin_aoai_evaluation(
    graders: Dict[str, AzureOpenAIGrader],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
    data: pd.DataFrame,
    run_name: str,
    **kwargs: Any,
) -> List[OAIEvalRunCreationInfo]:
    """
    Use the AOAI SDK to start an evaluation of the input dataset against the supplied graders.
    AOAI evaluation runs must be queried for completion, so this returns the IDs needed to poll for the
    results, and map those results to the user-supplied names of the graders.

    If any of the graders require unique column mappings, this function will
    create a separate evaluation run for each grader. Otherwise, all graders
    will be evaluated in a single run.

    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AzureOpenAIGrader.
    :type graders: Dict[str, AzureOpenAIGrader]
    :param column_mappings: The column mappings to use for the evaluation.
    :type column_mappings: Optional[Dict[str, Dict[str, str]]]
    :param data: The data to evaluate, preprocessed by the `_validate_and_load_data` method.
    :type data: pd.DataFrame
    :param run_name: The name of the evaluation run.
    :type run_name: str
    :return: A list of evaluation run info that can be used to retrieve the results of the evaluation later
    :rtype: List[OAIEvalRunCreationInfo]
    """

    LOGGER.info("AOAI: Aoai graders detected among evaluator inputs. Preparing to create OAI eval group...")
    all_eval_run_info: List[OAIEvalRunCreationInfo] = []

    grader_mapping_list = list(_get_graders_and_column_mappings(graders, column_mappings))
    LOGGER.info(f"AOAI: Will create {len(grader_mapping_list)} separate evaluation run(s) based on column mappings.")

    for idx, (selected_graders, selected_column_mapping) in enumerate(grader_mapping_list):
        LOGGER.info(
            f"AOAI: Starting evaluation run {idx + 1}/{len(grader_mapping_list)} with {len(selected_graders)} grader(s)..."
        )
        all_eval_run_info.append(
            _begin_single_aoai_evaluation(selected_graders, data, selected_column_mapping, run_name, **kwargs)
        )

    LOGGER.info(f"AOAI: Successfully created {len(all_eval_run_info)} evaluation run(s).")
    return all_eval_run_info


def _begin_single_aoai_evaluation(
    graders: Dict[str, AzureOpenAIGrader],
    data: pd.DataFrame,
    column_mapping: Optional[Dict[str, str]],
    run_name: str,
    **kwargs: Any,
) -> OAIEvalRunCreationInfo:
    """
    Use the AOAI SDK to start an evaluation of the input dataset against the supplied graders.
    AOAI evaluation runs must be queried for completion, so this returns the information needed to accomplish
    that task at a later time.

    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AzureOpenAIGrader.
    :type graders: Dict[str, AzureOpenAIGrader]
    :param data: The input data to evaluate, as a pandas DataFrame.
    :type data: pd.DataFrame
    :param column_mapping: The column mapping to apply. If None, an empty mapping is used.
    :type column_mapping: Optional[Dict[str, str]]
    :param run_name: The name of the evaluation run.
    :type run_name: str
    :return: An OAIEvalRunCreationInfo containing the eval group ID and eval run ID of the resultant eval run, as well
        as a dictionary that maps the user-supplied evaluators to the names of the graders as generated by the OAI service.
    :rtype: OAIEvalRunCreationInfo
    """

    # Format data for eval group creation
    LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}")
    grader_name_list = []
    grader_list = []

    data_source: Dict[str, Any] = {}
    data_source_config: Dict[str, Any] = {}

    if kwargs.get("data_source_config") is not None:
        data_source_config = kwargs.get("data_source_config", {})

    if kwargs.get("data_source") is not None:
        data_source = kwargs.get("data_source", {})

    # It's expected that all graders supplied for a single eval run use the same credentials
    # so grab a client from the first grader.
    client = list(graders.values())[0].get_client()

    for name, grader in graders.items():
        grader_name_list.append(name)
        grader_list.append(grader._grader_config)
    effective_column_mapping: Dict[str, str] = column_mapping or {}
    LOGGER.info(f"AOAI: Generating data source config with {len(effective_column_mapping)} column mapping(s)...")
    if data_source_config == {}:
        data_source_config = _generate_data_source_config(data, effective_column_mapping)
    LOGGER.info(f"AOAI: Data source config generated with schema type: {data_source_config.get('type')}")

    # Create eval group
    LOGGER.info(f"AOAI: Creating eval group with {len(grader_list)} testing criteria...")

    # Merge in any item schema that was generated outside the Eval SDK
    _combine_item_schemas(data_source_config, kwargs)

    eval_group_info = client.evals.create(
        data_source_config=data_source_config, testing_criteria=grader_list, metadata={"is_foundry_eval": "true"}
    )

    LOGGER.info(f"AOAI: Eval group created with id {eval_group_info.id}. Creating eval run next...")
    # Use eval group info to map grader IDs back to user-assigned names.
    grader_name_map = {}
    num_criteria = len(eval_group_info.testing_criteria)
    if num_criteria != len(grader_name_list):
        raise EvaluationException(
            message=f"Number of testing criteria ({num_criteria})"
            + f" returned by OAI eval group does not match oai graders({len(grader_name_list)}).",
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.AOAI_GRADER,
        )
    for name, criteria in zip(grader_name_list, eval_group_info.testing_criteria):
        grader_name_map[criteria.id] = name

    # Create eval run
    LOGGER.info(f"AOAI: Creating eval run '{run_name}' with {len(data)} data rows...")
    eval_run_id = _begin_eval_run(client, eval_group_info.id, run_name, data, effective_column_mapping, data_source)
    LOGGER.info(
        f"AOAI: Eval run created with id {eval_run_id}."
        + " Results will be retrieved after normal evaluation is complete..."
    )

    return OAIEvalRunCreationInfo(
        client=client,
        eval_group_id=eval_group_info.id,
        eval_run_id=eval_run_id,
        grader_name_map=grader_name_map,
        expected_rows=len(data),
    )


def _combine_item_schemas(data_source_config: Dict[str, Any], kwargs: Dict[str, Any]) -> None:
    if "item_schema" not in kwargs or "properties" not in kwargs["item_schema"]:
        return

    if "item_schema" in data_source_config:
        item_schema = kwargs["item_schema"]["required"] if "required" in kwargs["item_schema"] else []
        for key in kwargs["item_schema"]["properties"]:
            if key not in data_source_config["item_schema"]["properties"]:
                data_source_config["item_schema"]["properties"][key] = kwargs["item_schema"]["properties"][key]

                if key in item_schema:
                    data_source_config["item_schema"]["required"].append(key)


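A small usage sketch (not from the package) of how the merge above behaves; the private import path is taken from this diff and may change between versions, and the snippet assumes azure-ai-evaluation 1.13.3 is installed.

from azure.ai.evaluation._evaluate._evaluate_aoai import _combine_item_schemas

config = {
    "type": "custom",
    "item_schema": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]},
}
extra = {
    "item_schema": {
        "properties": {"ground_truth": {"type": "string"}},
        "required": ["ground_truth"],
    }
}
# Properties missing from the generated schema are copied in; required keys are appended.
_combine_item_schemas(config, extra)
assert "ground_truth" in config["item_schema"]["properties"]
assert config["item_schema"]["required"] == ["query", "ground_truth"]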
def _get_evaluation_run_results(all_run_info: List[OAIEvalRunCreationInfo]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
    pipeline to consume. This method accepts a list of eval run information, and will combine the
    results into a single dataframe and metrics dictionary.

    :param all_run_info: A list of evaluation run information that contains the needed values
        to retrieve the results of the evaluation run.
    :type all_run_info: List[OAIEvalRunCreationInfo]
    :return: A tuple containing the results of the evaluation run as a dataframe, and a dictionary of metrics
        calculated from the evaluation run.
    :rtype: Tuple[pd.DataFrame, Dict[str, Any]]
    :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
    """

    LOGGER.info(f"AOAI: Retrieving results from {len(all_run_info)} evaluation run(s)...")
    run_metrics = {}
    output_df = pd.DataFrame()
    for idx, run_info in enumerate(all_run_info):
        LOGGER.info(f"AOAI: Fetching results for run {idx + 1}/{len(all_run_info)} (ID: {run_info['eval_run_id']})...")
        cur_output_df, cur_run_metrics = _get_single_run_results(run_info)
        output_df = pd.concat([output_df, cur_output_df], axis=1)
        run_metrics.update(cur_run_metrics)

    LOGGER.info(f"AOAI: Successfully retrieved all results. Combined dataframe shape: {output_df.shape}")
    return output_df, run_metrics


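The combination step above is plain pandas: per-run result frames are concatenated horizontally and metric dictionaries are merged. An illustrative, standalone sketch with made-up column names:

import pandas as pd

df_a = pd.DataFrame({"outputs.relevance.score": [4, 5]})
df_b = pd.DataFrame({"outputs.string_check.passed": [True, False]})
combined = pd.concat([df_a, df_b], axis=1)  # one block of columns per grader run

metrics = {}
metrics.update({"relevance.pass_rate": 1.0})
metrics.update({"string_check.pass_rate": 0.5})

assert list(combined.columns) == ["outputs.relevance.score", "outputs.string_check.passed"]
assert metrics == {"relevance.pass_rate": 1.0, "string_check.pass_rate": 0.5}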
def _get_single_run_results(
    run_info: OAIEvalRunCreationInfo,
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Get the results of an OAI evaluation run, formatted in a way that is easy for the rest of the evaluation
    pipeline to consume.

    :param run_info: The evaluation run information that contains the needed values
        to retrieve the results of the evaluation run.
    :type run_info: OAIEvalRunCreationInfo
    :return: A tuple containing the results of the evaluation run as a dataframe, and a dictionary of metrics
        calculated from the evaluation run.
    :rtype: Tuple[pd.DataFrame, Dict[str, Any]]
    :raises EvaluationException: If the evaluation run fails or is not completed before timing out.
    """
    # Wait for evaluation run to complete
    LOGGER.info(f"AOAI: Waiting for eval run {run_info['eval_run_id']} to complete...")
    run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])

    LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
    if run_results.status != "completed":
        raise EvaluationException(
            message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
            + f" failed with status {run_results.status}.",
            blame=ErrorBlame.UNKNOWN,
            category=ErrorCategory.FAILED_EXECUTION,
            target=ErrorTarget.AOAI_GRADER,
        )

    # Convert run results into a dictionary of metrics
    LOGGER.info(f"AOAI: Processing results and calculating metrics for run {run_info['eval_run_id']}...")
    run_metrics: Dict[str, Any] = {}
    if run_results.per_testing_criteria_results is None:
        msg = (
            "AOAI evaluation run returned no results, despite 'completed' status. This might"
            + " occur when invalid or conflicting models are selected in the model and grader configs."
            f" Navigate to the evaluation run's report URL for more details: {run_results.report_url}"
        )
        raise EvaluationException(
            message=msg,
            blame=ErrorBlame.UNKNOWN,
            category=ErrorCategory.FAILED_EXECUTION,
            target=ErrorTarget.AOAI_GRADER,
        )
    for criteria_result in run_results.per_testing_criteria_results:
        grader_name = run_info["grader_name_map"][criteria_result.testing_criteria]
        passed = criteria_result.passed
        failed = criteria_result.failed
        ratio = passed / (passed + failed) if (passed + failed) else 0.0
        formatted_column_name = f"{grader_name}.pass_rate"
        run_metrics[formatted_column_name] = ratio
        LOGGER.info(f"AOAI: Grader '{grader_name}': {passed} passed, {failed} failed, pass_rate={ratio:.4f}")

    # Collect all results with pagination
    LOGGER.info(f"AOAI: Collecting output items for run {run_info['eval_run_id']} with pagination...")
    all_results: List[Any] = []
    next_cursor: Optional[str] = None
    limit = 100  # Max allowed by API

    while True:
        list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
        if next_cursor is not None:
            list_kwargs["after"] = next_cursor

        raw_list_results = run_info["client"].evals.runs.output_items.list(**list_kwargs)

        # Add current page results
        all_results.extend(raw_list_results.data)

        # Check for more pages
        if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
            if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
                next_cursor = raw_list_results.data[-1].id
            else:
                break
        else:
            break

    LOGGER.info(f"AOAI: Collected {len(all_results)} total output items across all pages.")
    listed_results: Dict[str, List[Any]] = {"index": []}
    # Raw data has no order guarantees; capture datasource_item_id per row for ordering.
    for row_result in all_results:
        listed_results["index"].append(row_result.datasource_item_id)
        for single_grader_row_result in row_result.results:
            if isinstance(single_grader_row_result, dict):
                result_dict = single_grader_row_result
            elif hasattr(single_grader_row_result, "model_dump"):
                result_dict = single_grader_row_result.model_dump()
            elif hasattr(single_grader_row_result, "dict"):
                result_dict = single_grader_row_result.dict()
            elif hasattr(single_grader_row_result, "__dict__"):
                result_dict = vars(single_grader_row_result)
            else:
                raise EvaluationException(
                    message=("Unsupported AOAI evaluation result type: " f"{type(single_grader_row_result)!r}."),
                    blame=ErrorBlame.UNKNOWN,
                    category=ErrorCategory.FAILED_EXECUTION,
                    target=ErrorTarget.AOAI_GRADER,
                )

            grader_result_name = result_dict.get("name", None)
            if grader_result_name is None:
                raise EvaluationException(
                    message="AOAI evaluation response missing grader result name; unable to map to original grader.",
                    blame=ErrorBlame.UNKNOWN,
                    category=ErrorCategory.FAILED_EXECUTION,
                    target=ErrorTarget.AOAI_GRADER,
                )

            grader_name = run_info["grader_name_map"][grader_result_name]
            for name, value in result_dict.items():
                if name in ["name"]:
                    continue
                if name.lower() == "passed":
                    # Create a `_result` column for each grader
                    result_column_name = f"outputs.{grader_name}.{grader_name}_result"
                    if len(result_column_name) < 50:
                        if result_column_name not in listed_results:
                            listed_results[result_column_name] = []
                        listed_results[result_column_name].append(EVALUATION_PASS_FAIL_MAPPING[value])

                formatted_column_name = f"outputs.{grader_name}.{name}"
                if formatted_column_name not in listed_results:
                    listed_results[formatted_column_name] = []
                listed_results[formatted_column_name].append(value)

    # Ensure all columns are the same length as the 'index' list
    num_rows = len(listed_results["index"])
    LOGGER.info(f"AOAI: Processing {num_rows} result rows into dataframe...")
    for col_name in list(listed_results.keys()):
        if col_name != "index":
            col_length = len(listed_results[col_name])
            if col_length < num_rows:
                listed_results[col_name].extend([None] * (num_rows - col_length))
            elif col_length > num_rows:
                listed_results[col_name] = listed_results[col_name][:num_rows]

    output_df = pd.DataFrame(listed_results)

    # If the 'index' column is missing for any reason, synthesize it from the current RangeIndex.
    if "index" not in output_df.columns:
        output_df["index"] = list(range(len(output_df)))

    # Deterministic ordering by original datasource_item_id
    output_df = output_df.sort_values("index", ascending=True)

    # Keep a temporary row-id copy for debugging/inspection.
    # Use underscores (not hyphens) to avoid pandas column handling quirks.
    output_df["__azure_ai_evaluation_index"] = output_df["index"]

    # Preserve original ids as index, then pad to expected length
    output_df.set_index("index", inplace=True)

    expected = run_info.get("expected_rows", None)
    if expected is not None:
        pre_len = len(output_df)
        LOGGER.info(f"AOAI: Validating result count: expected {expected} rows, received {pre_len} rows.")
        # Assumes original datasource_item_id space is 0..expected-1
        output_df = output_df.reindex(range(expected))
        if pre_len != expected:
            missing_rows = expected - pre_len
            LOGGER.warning(
                "AOAI grader run %s returned %d/%d rows; %d missing row(s) padded with NaN for alignment.",
                run_info["eval_run_id"],
                pre_len,
                expected,
                missing_rows,
            )
            # Add a per-grader 'row_missing' boolean for padded rows
            grader_user_names: Set[str] = set()
            for col in output_df.columns:
                if col.startswith("outputs."):
                    parts = col.split(".")
                    if len(parts) > 2:
                        grader_user_names.add(parts[1])
            if grader_user_names:
                missing_index_mask = output_df.isna().all(axis=1)
                for g in grader_user_names:
                    col_name = f"outputs.{g}.row_missing"
                    if col_name not in output_df:
                        output_df[col_name] = False
                    output_df.loc[missing_index_mask, col_name] = True

    # Drop the temporary helper column before returning (no public surface change)
    if "__azure_ai_evaluation_index" in output_df.columns:
        output_df.drop(columns=["__azure_ai_evaluation_index"], inplace=True, errors="ignore")

    # Reset to RangeIndex so downstream concatenation aligns on position
    output_df.reset_index(drop=True, inplace=True)
    LOGGER.info(
        f"AOAI: Successfully processed run {run_info['eval_run_id']} with final dataframe shape: {output_df.shape}"
    )
    return output_df, run_metrics


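The output-item collection above uses cursor pagination against evals.runs.output_items: request pages of `limit` items, pass the id of the last item as `after`, and stop when `has_more` is False. A standalone sketch of the same loop shape, with FakePage and FakeClient as illustrative stand-ins rather than real OpenAI SDK types:

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Item:
    id: str

@dataclass
class FakePage:
    data: List[Item]
    has_more: bool

class FakeClient:
    def __init__(self, items: List[Item]):
        self._items = items
    def list(self, limit: int, after: Optional[str] = None) -> FakePage:
        start = 0 if after is None else next(i for i, it in enumerate(self._items) if it.id == after) + 1
        chunk = self._items[start : start + limit]
        return FakePage(data=chunk, has_more=start + limit < len(self._items))

client = FakeClient([Item(f"row-{i}") for i in range(7)])
collected, cursor = [], None
while True:
    page = client.list(limit=3, after=cursor)
    collected.extend(page.data)
    if page.has_more and page.data:
        cursor = page.data[-1].id  # last item id becomes the next cursor
    else:
        break
assert [it.id for it in collected] == [f"row-{i}" for i in range(7)]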
def _convert_remote_eval_params_to_grader(grader_id: str, init_params: Dict[str, Any]) -> AzureOpenAIGrader:
    """
    Helper function for the remote evaluation service.
    Given a model ID that refers to a specific AOAI grader wrapper class, return an instance of that class
    using the provided initialization parameters.

    :param grader_id: The model ID that refers to a specific AOAI grader wrapper class.
    :type grader_id: str
    :param init_params: The initialization parameters to be used for the AOAI grader wrapper class.
        Requires that it contain a model_config and grader_config as top-level keys.
    :type init_params: Dict[str, Any]
    """

    model_config = init_params.get("model_config", None)
    if model_config is None:
        raise EvaluationException(
            message="Grader converter needs a valid 'model_config' key in init_params.",
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
            target=ErrorTarget.AOAI_GRADER,
        )

    grader_class = _get_grader_class(grader_id)
    return grader_class(**init_params)


def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
    """
    Given a model ID, return the class of the corresponding grader wrapper.
    """

    from azure.ai.evaluation import (
        AzureOpenAIGrader,
        AzureOpenAILabelGrader,
        AzureOpenAIStringCheckGrader,
        AzureOpenAITextSimilarityGrader,
        AzureOpenAIScoreModelGrader,
        AzureOpenAIPythonGrader,
    )

    id_map = {
        AzureOpenAIGrader.id: AzureOpenAIGrader,
        AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
        AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
        AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
        AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
        AzureOpenAIPythonGrader.id: AzureOpenAIPythonGrader,
    }

    for key in id_map.keys():
        if model_id == key:
            return id_map[key]
    raise EvaluationException(
        message=f"Model ID {model_id} not recognized as an AOAI grader ID",
        blame=ErrorBlame.USER_ERROR,
        category=ErrorCategory.INVALID_VALUE,
        target=ErrorTarget.AOAI_GRADER,
    )


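A hypothetical usage sketch of the lookup above; it assumes the 1.13.3 package is installed, uses the wrapper's own id class attribute rather than a hard-coded string (the literal id values are not reproduced here), and relies on the private module path shown in this diff.

from azure.ai.evaluation import AzureOpenAIGrader, AzureOpenAIStringCheckGrader
from azure.ai.evaluation._evaluate._evaluate_aoai import _get_grader_class

# Resolve a wrapper class from its registered id; the result is a grader wrapper type.
cls = _get_grader_class(AzureOpenAIStringCheckGrader.id)
assert issubclass(cls, AzureOpenAIGrader)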
def _get_graders_and_column_mappings(
    graders: Dict[str, AzureOpenAIGrader],
    column_mappings: Optional[Dict[str, Dict[str, str]]],
) -> List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]:
    """
    Given a dictionary of column mappings and a dictionary of AOAI graders,
    split them into sub-lists and sub-dictionaries that each correspond to a single evaluation run
    that must be performed to evaluate the entire dataset.

    Currently this function is fairly naive; it always splits the data if there are multiple
    graders present and any of them have a unique column mapping.

    This odd separation of the data is necessary because our system allows for different evaluators
    to have different dataset columns mapped to the same input name for each evaluator, while
    the OAI API can't. So, if there's a possibility that such a conflict might arise,
    we need to split the incoming data up.

    Currently splits each grader into its own eval group/run to ensure they each use
    their own credentials later on. A planned fast-follow is to group graders by
    matching credentials.

    :param graders: The graders to use for the evaluation. Should be a dictionary of string to AzureOpenAIGrader.
    :type graders: Dict[str, AzureOpenAIGrader]
    :param column_mappings: The column mappings to use for the evaluation.
    :type column_mappings: Optional[Dict[str, Dict[str, str]]]
    :return: A list of tuples, each containing a dictionary of AOAI graders,
        and the column mapping they should use.
    :rtype: List[Tuple[Dict[str, AzureOpenAIGrader], Optional[Dict[str, str]]]]
    """

    LOGGER.info(f"AOAI: Organizing {len(graders)} graders with column mappings...")
    if column_mappings is None:
        LOGGER.info("AOAI: No column mappings provided, each grader will have its own eval run.")
        return [({name: grader}, None) for name, grader in graders.items()]
    default_mapping = column_mappings.get("default", None)
    if default_mapping is None:
        default_mapping = {}
    LOGGER.info(
        f"AOAI: Using default mapping with {len(default_mapping)} entries for graders without specific mappings."
    )
    return [
        ({name: grader}, None if column_mappings is None else column_mappings.get(name, default_mapping))
        for name, grader in graders.items()
    ]

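A usage sketch of the mapping-resolution rule (not part of the packaged file): each grader gets its own run, using its named mapping if present and otherwise the "default" mapping. Plain strings stand in for grader instances, which the function does not inspect at runtime.

from azure.ai.evaluation._evaluate._evaluate_aoai import _get_graders_and_column_mappings

graders = {"relevance": "grader-a", "string_check": "grader-b"}
mappings = {
    "default": {"query": "${data.item.query}"},
    "string_check": {"response": "${data.item.response}"},
}
runs = _get_graders_and_column_mappings(graders, mappings)
assert runs == [
    ({"relevance": "grader-a"}, {"query": "${data.item.query}"}),
    ({"string_check": "grader-b"}, {"response": "${data.item.response}"}),
]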

def _build_schema_tree_from_paths(
    paths: List[str],
    force_leaf_type: str = "string",
) -> Dict[str, Any]:
    """
    Build a nested JSON schema (object) from a list of dot-delimited paths.
    Each path represents a leaf. Intermediate segments become nested object properties.

    Example input paths:
        ["item.query",
         "item.context.company.policy.security.passwords.rotation_days",
         "item.context.company.policy.security.network.vpn.required"]

    Returns schema fragment:
        {
            "type": "object",
            "properties": {
                "item": {
                    "type": "object",
                    "properties": {
                        "query": {"type": "string"},
                        "context": {
                            "type": "object",
                            "properties": {
                                "company": { ... }
                            },
                            "required": ["company"]
                        }
                    },
                    "required": ["query", "context"]
                }
            },
            "required": ["item"]
        }

    :param paths: A list of dot-delimited strings, each representing a leaf path
        in the logical object hierarchy (e.g. ``"item.context.company.policy.security.passwords.rotation_days"``).
        Empty path segments are ignored.
    :type paths: List[str]
    :param force_leaf_type: The JSON Schema ``type`` value to assign to every leaf node
        produced from the supplied paths. Defaults to ``"string"``.
    :type force_leaf_type: str
    :return: A JSON Schema fragment describing the hierarchical structure implied by
        the input paths. The returned schema root always has ``type: object`` with
        recursively nested ``properties`` / ``required`` keys.
    :rtype: Dict[str, Any]
    """
    # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool }
    root: Dict[str, Any] = {"__children__": {}, "__leaf__": False}

    def insert(path: str):
        parts = [p for p in path.split(".") if p]
        node = root
        for i, part in enumerate(parts):
            children = node["__children__"]
            if part not in children:
                children[part] = {"__children__": {}, "__leaf__": False}
            node = children[part]
            if i == len(parts) - 1:
                node["__leaf__"] = True

    for p in paths:
        insert(p)

    def to_schema(node: Dict[str, Any]) -> Dict[str, Any]:
        children = node["__children__"]
        if not children:
            # Leaf node
            return {"type": force_leaf_type}
        props = {}
        required = []
        for name, child in children.items():
            props[name] = to_schema(child)
            required.append(name)
        return {
            "type": "object",
            "properties": props,
            "required": required,
        }

    return to_schema(root)


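A usage sketch for the path-to-schema builder above (private import path taken from this diff): dotted leaf paths become nested object properties, and every intermediate segment is marked required.

from azure.ai.evaluation._evaluate._evaluate_aoai import _build_schema_tree_from_paths

schema = _build_schema_tree_from_paths(["query", "context.company.policy"])
assert schema == {
    "type": "object",
    "properties": {
        "query": {"type": "string"},
        "context": {
            "type": "object",
            "properties": {
                "company": {
                    "type": "object",
                    "properties": {"policy": {"type": "string"}},
                    "required": ["policy"],
                }
            },
            "required": ["company"],
        },
    },
    "required": ["query", "context"],
}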
def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
    """
    Produce a data source config (JSON schema) that reflects nested object structure
    when column mappings reference dotted paths (e.g., item.context.company...).

    Backward compatibility:
    - If all referenced source paths are single tokens (flat), fall back to legacy flat schema.
    - Otherwise build a nested object schema covering only referenced leaves.

    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
        helper function.
    :type input_data_df: pd.DataFrame
    :param column_mapping: The column mapping to use for the evaluation. If None, the default mapping will be used.
    :type column_mapping: Optional[Dict[str, str]]
    :return: A dictionary that can act as data source config for OAI evaluation group creation.
    :rtype: Dict[str, Any]
    """
    # Extract referenced data paths from mapping values of the form ${data.<path>} (ignore ${run.outputs.*})
    LOGGER.info(
        f"AOAI: Generating data source config for {len(input_data_df)} rows with {len(column_mapping)} column mapping(s)..."
    )
    referenced_paths: List[str] = []
    for v in column_mapping.values():
        m = DATA_PATH_PATTERN.match(v)
        if m:
            referenced_paths.append(m.group(1))

    LOGGER.info(f"AOAI: Found {len(referenced_paths)} referenced paths in column mappings: {referenced_paths}")
    # Decide if we have nested structures
    has_nested = any("." in p for p in referenced_paths)
    LOGGER.info(f"AOAI: Schema generation mode: {'nested' if has_nested else 'flat'}")

    if not referenced_paths or not has_nested:
        # Legacy flat behavior (existing logic): treat each mapping key as independent string field
        LOGGER.info("AOAI: Using flat schema generation (no nested structures detected).")
        data_source_config = {
            "type": "custom",
            "item_schema": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        }
        props = data_source_config["item_schema"]["properties"]
        req = data_source_config["item_schema"]["required"]
        for key in column_mapping.keys():
            if key in input_data_df and len(input_data_df[key]) > 0 and isinstance(input_data_df[key].iloc[0], list):
                props[key] = {"type": "array"}
            else:
                props[key] = {"type": "string"}
            req.append(key)
        LOGGER.info(f"AOAI: Flat schema generated with {len(props)} properties: {list(props.keys())}")
        return data_source_config

    # NEW: If all nested paths share the same first segment (e.g. 'item'),
    # treat that segment as the wrapper already provided by the JSONL line ("item": {...})
    # so we exclude it from the schema (schema describes the *inside* of "item").
    first_segments = {p.split(".")[0] for p in referenced_paths}
    strip_wrapper = False
    wrapper_name = None
    LOGGER.info(f"AOAI: First segments in referenced paths: {first_segments}")
    if len(first_segments) == 1:
        only_seg = next(iter(first_segments))
        # We only strip if that segment looks like the canonical wrapper.
        if only_seg == WRAPPER_KEY:
            strip_wrapper = True
            wrapper_name = only_seg
            LOGGER.info(f"AOAI: All paths start with wrapper '{WRAPPER_KEY}', will strip from schema.")

    effective_paths = referenced_paths
    if strip_wrapper:
        stripped = []
        for p in referenced_paths:
            parts = p.split(".", 1)
            if len(parts) == 2:
                stripped.append(parts[1])  # drop leading 'item.'
            else:
                # Path was just 'item' (no leaf) – ignore; it doesn't define a leaf value.
                continue
        # If stripping produced at least one usable path, adopt; else fall back to original.
        if stripped:
            effective_paths = stripped
            LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}")

    LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...")
    nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string")

    LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'")
    return {
        "type": "custom",
        "item_schema": nested_schema,
    }


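A usage sketch of the nested branch above (not part of the packaged file): a mapping whose values all reference dotted ${data.item.*} paths triggers nested-schema generation, and the leading "item" wrapper segment is stripped because the JSONL rows are emitted as {"item": {...}}.

import pandas as pd
from azure.ai.evaluation._evaluate._evaluate_aoai import _generate_data_source_config

df = pd.DataFrame({"item.query": ["q1"], "item.context.policy": ["p1"]})
mapping = {
    "query": "${data.item.query}",
    "policy": "${data.item.context.policy}",
}
config = _generate_data_source_config(df, mapping)
assert config["type"] == "custom"
# The schema describes the inside of "item": top-level properties are 'query' and 'context'.
assert set(config["item_schema"]["properties"]) == {"query", "context"}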
def _generate_default_data_source_config(input_data_df: pd.DataFrame) -> Dict[str, Any]:
    """Produce a data source config that naively maps all columns from the supplied data source into
    the OAI API.

    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
        helper function.
    :type input_data_df: pd.DataFrame
    :return: A dictionary that can act as data source config for OAI evaluation group creation.
    :rtype: Dict[str, Any]
    """

    properties = {}
    required = []

    for column in input_data_df.columns:
        properties[column] = {
            "type": "string",
        }
        required.append(column)
    data_source_config = {
        "type": "custom",
        "item_schema": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }
    return data_source_config


def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]) -> Dict[str, Any]:
    """
    Given a dataframe of data to be evaluated, and a column mapping,
    produce a dictionary that can be used as the data source input for an OAI evaluation run.
    Builds a nested 'item' object mirroring the hierarchical paths in the mapping values.

    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
        helper function.
    :type input_data_df: pd.DataFrame
    :param column_mapping: The column mapping to use for the evaluation. If None, a naive 1:1 mapping is used.
    :type column_mapping: Optional[Dict[str, str]]
    :return: A dictionary that can be used as the data source input for an OAI evaluation run.
    :rtype: Dict[str, Any]
    """

    def _convert_value_to_string(val: Any) -> str:
        """Convert a value to string representation for AOAI evaluation."""
        if val is None:
            return ""
        elif isinstance(val, (str, int, float, bool)):
            return str(val)
        else:
            try:  # Attempt to JSON serialize lists/dicts
                return json.dumps(val, ensure_ascii=False)
            except (TypeError, ValueError):
                # Fallback for unserializable objects
                return str(val)

    LOGGER.info(
        f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..."
    )
    # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
    # relative_parts excludes the wrapper (so schema + content align).
    path_specs: List[Tuple[str, List[str], str]] = []

    for name, formatted_entry in column_mapping.items():
        if not (
            isinstance(formatted_entry, str) and formatted_entry.startswith("${") and formatted_entry.endswith("}")
        ):
            continue
        body = formatted_entry[2:-1]  # remove ${ }
        pieces = body.split(".")

        if not pieces:
            continue

        if pieces[0] == "data":
            # Data path: data.<maybe wrapper>.<...>
            if len(pieces) == 1:
                continue
            source_path = ".".join(pieces[1:])  # e.g. item.context.company...
            # Skip mapping of wrapper itself
            if source_path == WRAPPER_KEY:
                continue

            # Determine dataframe column name (it is the full dotted path as flattened earlier)
            dataframe_col = source_path

            # Relative parts for nested insertion (drop leading wrapper if present)
            if source_path.startswith(WRAPPER_KEY + "."):
                relative_path = source_path[len(WRAPPER_KEY) + 1 :]
            else:
                # Path not under wrapper; treat its segments as is (will live directly under wrapper)
                relative_path = source_path

            relative_parts = [p for p in relative_path.split(".") if p]

            # Defensive: if mapping alias differs from leaf, prefer actual path leaf to stay consistent.
            # (If you want alias override, replace relative_parts[-1] with name when name != path_leaf.)
            if not relative_parts:
                continue

            path_specs.append((formatted_entry, relative_parts, dataframe_col))

        elif pieces[0] == "run" and len(pieces) >= 3 and pieces[1] == "outputs":
            # Target / run outputs become __outputs.<rest> columns
            run_col = "__outputs." + ".".join(pieces[2:])
            leaf_name = pieces[-1]
            path_specs.append((formatted_entry, [leaf_name], run_col))

    LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.")
    content: List[Dict[str, Any]] = []

    for _, row in input_data_df.iterrows():
        item_root: Dict[str, Any] = {}

        # Track which dataframe columns have been processed via column_mapping
        processed_cols: Set[str] = set()

        for _, rel_parts, df_col in path_specs:
            # Safely fetch value
            val = row.get(df_col, None)

            if isinstance(val, list):
                str_val = val
            else:
                # Convert value to string to match schema's "type": "string" leaves.
                str_val = _convert_value_to_string(val)

            # Insert into nested dict
            cursor = item_root
            for seg in rel_parts[:-1]:
                nxt = cursor.get(seg)
                if not isinstance(nxt, dict):
                    nxt = {}
                    cursor[seg] = nxt
                cursor = nxt
            leaf_key = rel_parts[-1]
            cursor[leaf_key] = str_val

            # Mark this dataframe column as processed
            processed_cols.add(df_col)

        # Add any unmapped dataframe columns directly to item_root
        for col_name in input_data_df.columns:
            if col_name not in processed_cols:
                val = row.get(col_name, None)
                if isinstance(val, list):
                    str_val = val
                else:
                    str_val = _convert_value_to_string(val)
                item_root[col_name] = str_val

        content.append({WRAPPER_KEY: item_root})

    LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.")
    return {
        "type": "jsonl",
        "source": {
            "type": "file_content",
            "content": content,
        },
    }


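A usage sketch of the resulting payload shape for a single mapped column (not part of the packaged file; assumes the 1.13.3 package and pandas are installed): values referenced through ${data.item.*} are nested under the "item" wrapper, and the payload is sent as inline JSONL file content.

import pandas as pd
from azure.ai.evaluation._evaluate._evaluate_aoai import _get_data_source

df = pd.DataFrame({"item.query": ["What is the rotation policy?"]})
source = _get_data_source(df, {"query": "${data.item.query}"})
assert source["type"] == "jsonl"
assert source["source"]["content"] == [{"item": {"query": "What is the rotation policy?"}}]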
def _begin_eval_run(
    client: Union[OpenAI, AzureOpenAI],
    eval_group_id: str,
    run_name: str,
    input_data_df: pd.DataFrame,
    column_mapping: Dict[str, str],
    data_source_params: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Given an eval group ID and the input data, use the AOAI API to
    start an evaluation run with the given name.
    Returns the ID of the created run, which can be used to monitor it.

    :param client: The AOAI client to use for the evaluation.
    :type client: Union[OpenAI, AzureOpenAI]
    :param eval_group_id: The ID of the evaluation group to use for the evaluation run.
    :type eval_group_id: str
    :param run_name: The name of the evaluation run.
    :type run_name: str
    :param input_data_df: The input data to be evaluated, as produced by the `_validate_and_load_data`
        helper function.
    :type input_data_df: pd.DataFrame
    :return: The ID of the evaluation run.
    :rtype: str
    """

    LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...")
    data_source = _get_data_source(input_data_df, column_mapping)

    if data_source_params is not None:
        data_source.update(data_source_params)

    eval_run = client.evals.runs.create(
        eval_id=eval_group_id,
        data_source=cast(Any, data_source),  # Cast for type checker: dynamic schema dict accepted by SDK at runtime
        name=run_name,
        metadata={"sample_generation": "off", "file_format": "jsonl", "is_foundry_eval": "true"},
        # TODO decide if we want to add our own timeout value?
    )
    LOGGER.info(f"AOAI: Eval run created successfully with ID: {eval_run.id}")
    return eval_run.id


# Post built TODO: replace with _red_team.py's retry logic?
def _wait_for_run_conclusion(
    client: Union[OpenAI, AzureOpenAI], eval_group_id: str, eval_run_id: str, max_wait_seconds=21600
) -> Any:
    """
    Perform exponential backoff polling to get the results of an AOAI evaluation run.
    Raises an EvaluationException if max attempts are reached without receiving a concluding status.

    :param client: The AOAI client to use for the evaluation.
    :type client: Union[OpenAI, AzureOpenAI]
    :param eval_group_id: The ID of the evaluation group that contains the evaluation run of interest.
    :type eval_group_id: str
    :param eval_run_id: The evaluation run ID to get the results of.
    :type eval_run_id: str
    :param max_wait_seconds: The maximum amount of time to wait for the evaluation run to complete.
    :type max_wait_seconds: int
    :return: The results of the evaluation run.
    :rtype: Any
    """

    LOGGER.info(f"AOAI: Getting OAI eval run results from group/run {eval_group_id}/{eval_run_id}...")
    total_wait = 0
    iters = 0
    # Exponential backoff: the interval starts at 3 seconds and grows by 1.5x per
    # iteration; the final interval is trimmed so total wait never exceeds max_wait_seconds.
    wait_interval = 3  # Seconds.
    while True:
        wait_interval *= 1.5
        total_wait += wait_interval
        # Reduce last wait interval if total wait time exceeds max wait time
        if total_wait > max_wait_seconds:
            wait_interval -= total_wait - max_wait_seconds
        sleep(wait_interval)
        iters += 1
        response = client.evals.runs.retrieve(eval_id=eval_group_id, run_id=eval_run_id)
        LOGGER.info(f"AOAI: Polling iteration {iters}, status: {response.status}, total wait: {total_wait:.1f}s")
        if response.status not in ["queued", "in_progress"]:
            LOGGER.info(f"AOAI: Eval run {eval_run_id} reached terminal status: {response.status}")
            return response
        if total_wait > max_wait_seconds:
            raise EvaluationException(
                message=f"Timed out waiting for AOAI evaluation to complete after {iters}"
                + f" rounds of polling. Final status was {response.status}",
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.FAILED_EXECUTION,
                target=ErrorTarget.AOAI_GRADER,
            )
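A worked, standalone sketch (not from the package) of the polling schedule this loop produces: the interval starts at 3 seconds, grows by 1.5x per iteration, and the last interval is trimmed so the cumulative sleep never exceeds max_wait_seconds.

def backoff_schedule(max_wait_seconds: float = 21600.0, start: float = 3.0):
    # Yield successive wait intervals mirroring the arithmetic in _wait_for_run_conclusion.
    total, interval = 0.0, start
    while total < max_wait_seconds:
        interval *= 1.5
        total += interval
        if total > max_wait_seconds:
            interval -= total - max_wait_seconds  # trim the final interval
            total = max_wait_seconds
        yield interval

waits = list(backoff_schedule(max_wait_seconds=60.0))
assert waits[0] == 4.5 and waits[1] == 6.75
assert abs(sum(waits) - 60.0) <= 1e-9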