azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_common/_base_eval.py (evidently the new 742-line file from entry 112):
@@ -0,0 +1,742 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import inspect
+ from abc import ABC, abstractmethod
+ import json
+ import copy
+ from typing import (
+     Any,
+     Callable,
+     Dict,
+     Generic,
+     List,
+     Tuple,
+     TypedDict,
+     TypeVar,
+     Union,
+     cast,
+     final,
+     Optional,
+ )
+
+ from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
+ from typing_extensions import ParamSpec, TypeAlias, get_overloads
+
+ from azure.ai.evaluation._exceptions import (
+     ErrorBlame,
+     ErrorCategory,
+     ErrorTarget,
+     EvaluationException,
+ )
+ from azure.ai.evaluation._common.utils import remove_optional_singletons
+ from azure.ai.evaluation._constants import (
+     _AggregationType,
+     EVALUATION_PASS_FAIL_MAPPING,
+ )
+ from azure.ai.evaluation._model_configurations import Conversation
+ from azure.ai.evaluation._common._experimental import experimental
+
+ from ._conversation_aggregators import GetAggregator, GetAggregatorType
+
+ P = ParamSpec("P")
+ T = TypeVar("T")
+ T_EvalValue = TypeVar("T_EvalValue")
+
+
+ class DerivedEvalInput(TypedDict, total=False):
+     """The eval input generated by EvaluatorBase._derive_conversation_converter."""
+
+     query: Dict[str, Any]
+     response: Dict[str, Any]
+     context: str
+     ground_truth: str
+
+
+ AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+ """TypeAlias that models the return value of EvaluatorBase._aggregate_results
+
+ .. code-block:: python
+
+     foo: AggregateResult[float] = {
+         "evaluation_per_turn": {
+             "coherence": [1.0, 2.0, 3.0]
+         },
+         "coherence": 2.0
+     }
+ """
+
+ DoEvalResult: TypeAlias = Dict[str, T]
+ """TypeAlias that models the return value of EvaluatorBase._do_eval
+
+ .. code-block:: python
+
+     foo: DoEvalResult[float] = {
+         "coherence": 2.0
+     }
+ """
+
+
+ # TODO exception target pass down?
+ class EvaluatorBase(ABC, Generic[T_EvalValue]):
+     """Base class for all evaluators that are capable of accepting either a group of single values,
+     or a conversation as input. All such evaluators need to implement two functions of their own:
+     - _convert_kwargs_to_eval_input
+     - _do_eval
+
+     Additionally, __call__ should be overridden to reshape the function header as needed to produce more informative
+     documentation, although ideally the actual child implementation of __call__ should just amount to
+     'super().__call__(<inputs>)'.
+
+     :param not_singleton_inputs: A list of strings that represent the names of
+         inputs to the child evaluator's __call__ function that are NOT singleton inputs. By default, this
+         is ["conversation", "kwargs"].
+     :type not_singleton_inputs: List[str]
+     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
+     :type eval_last_turn: bool
+     :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+         to produce a single result. Default is ~azure.ai.evaluation._AggregationType.MEAN.
+     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+     :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+         overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+     :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
+     :param threshold: The threshold for the evaluation. Default is 3.
+     :type threshold: float
+     :param _higher_is_better: If True, higher scores are better. Default is True.
+     :type _higher_is_better: Optional[bool]
+     """
+
+     _NOT_APPLICABLE_RESULT = "not applicable"
+     _PASS_RESULT = "pass"
+     _FAIL_RESULT = "fail"
+     _type = "azure_ai_evaluator"
+
+     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN ~~~
+
+     # Make sure to call super().__init__() in the child class's __init__ method.
+     # pylint: disable=dangerous-default-value
+     def __init__(
+         self,
+         *,
+         threshold: float = 3.0,
+         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
+         eval_last_turn: bool = False,
+         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+         conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
+         _higher_is_better: Optional[bool] = True,
+     ):
+         self._not_singleton_inputs = not_singleton_inputs
+         self._eval_last_turn = eval_last_turn
+         self._singleton_inputs = self._derive_singleton_inputs()
+         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+         self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+         self._higher_is_better = _higher_is_better
+         self._threshold = threshold
+         if conversation_aggregator_override is not None:
+             # Type ignore since we already checked for None, but mypy doesn't know that.
+             self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
+
+     # This needs to be overridden just to change the function header into something more informative,
+     # and to be able to add a more specific docstring. The actual function contents should just be
+     # super().__call__(<inputs>)
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+         """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
+         one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
+         The actual behavior of this function shouldn't change beyond adding more inputs to the
+         async_run_allowing_running_loop call.
+
+         :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
+         :type kwargs: Dict
+         :return: The evaluation result
+         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+         """
+         return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
+
+     @abstractmethod
+     async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
+         """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
+         In the default case, all required inputs are assumed to be within eval_input, as user-friendly
+         typing is handled above this function in favor of polymorphic simplicity. This function must be
+         asynchronous.
+
+         :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
+         :type eval_input: Any
+         :return: A single evaluation result
+         :rtype: DoEvalResult[T_EvalValue]
+         """
+
+     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN ~~~
+
+     def _derive_singleton_inputs(self) -> List[List[str]]:
+         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
+         when the evaluator is being used in a non-conversation context.
+         By default, it's assumed that any input that is NOT kwargs or a conversation is a singleton input.
+         Thankfully this works the way you'd hope, with the call_signature being based on the child
+         function's signature, not the parent's.
+
+         :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+         :rtype: List[List[str]]
+         """
+
+         overloads = get_overloads(self.__call__)
+         if not overloads:
+             call_signatures = [inspect.signature(self.__call__)]
+         else:
+             call_signatures = [inspect.signature(overload) for overload in overloads]
+
+         overload_inputs = []
+         for call_signature in call_signatures:
+             params = call_signature.parameters
+             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
+                 continue
+             # exclude self since it is not a singleton input
+             overload_inputs.append([p for p in params if p != "self"])
+         return overload_inputs
+
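(Illustrative aside, not part of the diff; the child evaluator here is hypothetical.) Given overloaded __call__ signatures, _derive_singleton_inputs collects one name-list per overload and drops any overload that contains a non-singleton input:

    # Hypothetical child evaluator declaring two overloads:
    #     def __call__(self, *, query: str, response: str) -> ...
    #     def __call__(self, *, conversation: dict) -> ...
    # The second overload contains "conversation" (a non-singleton input),
    # so it is skipped, and _derive_singleton_inputs() returns:
    #     [["query", "response"]]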
+     def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+         """Find the overload that matches the provided kwargs and return its input parameters.
+
+         :keyword kwargs: The keyword arguments to match against overloads.
+         :type kwargs: Dict
+         :return: List of input parameter names for the matching overload.
+         :rtype: List[str]
+         """
+         overload_inputs = self._singleton_inputs
+         provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+         # Find the overload that best matches the provided parameters
+         best_match = None
+         best_score = -1
+
+         for inputs in overload_inputs:
+             input_set = set(inputs)
+
+             # Calculate match score: how many of the overload's params are provided
+             if input_set.issubset(provided_keys):
+                 score = len(input_set)
+                 if score > best_score:
+                     best_score = score
+                     best_match = inputs
+
+         # If exact match found, return it
+         if best_match is not None:
+             return best_match
+
+         # If no exact match, find the overload with the most overlap
+         for inputs in overload_inputs:
+             input_set = set(inputs)
+             overlap = len(input_set.intersection(provided_keys))
+             if overlap > best_score:
+                 best_score = overlap
+                 best_match = inputs
+
+         # Return the best match or the first overload as fallback
+         return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+     def _get_all_singleton_inputs(self) -> List[str]:
+         """Get a flattened list of all possible singleton inputs across all overloads.
+
+         :return: Flattened list of all singleton input names.
+         :rtype: List[str]
+         """
+         all_inputs = set()
+         for inputs in self._singleton_inputs:
+             all_inputs.update(inputs)
+         return list(all_inputs)
+
+     def _derive_conversation_converter(
+         self,
+     ) -> Callable[[Dict], List[DerivedEvalInput]]:
+         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
+         This uses the inputs derived from the _derive_singleton_inputs function to determine which
+         aspects of a conversation ought to be extracted.
+
+         :return: The function that will be used to convert conversations to evaluable inputs.
+         :rtype: Callable
+         """
+         all_singleton_inputs = self._get_all_singleton_inputs()
+         include_context = "context" in all_singleton_inputs
+         include_query = "query" in all_singleton_inputs
+         include_response = "response" in all_singleton_inputs
+         include_ground_truth = "ground_truth" in all_singleton_inputs
+
+         def converter(conversation: Dict) -> List[DerivedEvalInput]:
+             messages = cast(List[Dict[str, Any]], conversation["messages"])
+             global_context = conversation.get("context", None)
+             # Extract queries, responses from conversation
+             queries: List[Dict[str, Any]] = []
+             responses: List[Dict[str, Any]] = []
+
+             # Convert conversation slice into queries and responses.
+             # Assume that 'user' role is asking queries and 'assistant' role is responding.
+             if self._eval_last_turn and len(messages) > 1:
+                 messages = messages[-2:]
+
+             for each_turn in messages:
+                 role = each_turn["role"]
+                 if role == "user":
+                     queries.append(each_turn)
+                 elif role == "assistant":
+                     responses.append(each_turn)
+             # TODO complain if len(queries) != len(responses)?
+             eval_inputs = []
+             for query, response in zip(queries, responses):
+                 context = {}
+                 if include_context:
+                     query_context = query.get("context", None)
+                     response_context = response.get("context", None)
+                     if global_context:
+                         context["global_context"] = global_context
+                     if query_context and include_query:
+                         context["query_context"] = query_context
+                     if response_context and include_response:
+                         context["response_context"] = response_context
+
+                 eval_input: DerivedEvalInput = {}
+                 if include_query:
+                     eval_input["query"] = query.get("content", "")
+                 if include_response:
+                     eval_input["response"] = response.get("content", "")
+                 if include_context:
+                     eval_input["context"] = str(context)
+                 if include_ground_truth:
+                     eval_input["ground_truth"] = response.get("ground_truth", "")
+                 eval_inputs.append(eval_input)
+             return eval_inputs
+
+         return converter
+
+     def _derive_multi_modal_conversation_converter(
+         self,
+     ) -> Callable[[Dict], List[Dict[str, Any]]]:
+         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+         This uses the inputs derived from the _derive_singleton_inputs function to determine which
+         aspects of a conversation ought to be extracted.
+
+         :return: The function that will be used to convert conversations to evaluable inputs.
+         :rtype: Callable
+         """
+
+         def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+             messages = cast(List[Dict[str, Any]], conversation["messages"])
+             # Extract user messages, assistant messages from conversation
+             user_messages: List[Dict[str, Any]] = []
+             assistant_messages: List[Dict[str, Any]] = []
+             system_messages: List[Dict[str, Any]] = []
+
+             # Convert conversation slice into queries and responses.
+             # Assume that 'user' role is asking queries and 'assistant' role is responding.
+             if self._eval_last_turn and len(messages) > 1:
+                 messages = messages[-2:]
+
+             for each_turn in messages:
+                 role = each_turn["role"]
+                 if role == "user":
+                     user_messages.append(each_turn)
+                 elif role == "assistant":
+                     assistant_messages.append(each_turn)
+                 elif role == "system":
+                     system_messages.append(each_turn)
+
+             # validation
+             if len(user_messages) != len(assistant_messages):
+                 raise EvaluationException(
+                     message="Mismatched number of user and assistant messages.",
+                     internal_message=("Mismatched number of user and assistant messages."),
+                 )
+             if len(assistant_messages) > 1:
+                 raise EvaluationException(
+                     message="Conversation can have only one assistant message.",
+                     internal_message=("Conversation can have only one assistant message."),
+                 )
+             eval_conv_inputs = []
+             for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                 conv_messages = []
+                 if len(system_messages) == 1:
+                     conv_messages.append(system_messages[0])
+                 conv_messages.append(user_msg)
+                 conv_messages.append(assist_msg)
+                 eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+             return eval_conv_inputs
+
+         return multi_modal_converter
+
+     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
+         """Convert an arbitrary input into a list of inputs for evaluators.
+         It is assumed that evaluators generally make use of their inputs in one of two ways.
+         Either they receive a collection of keyname inputs that are all single values
+         (like a query and response), or they receive a conversation that is a list of dictionary
+         values.
+
+         The self._singleton_inputs list (containing overload signatures) assigned during initialization
+         is used to find and extract singleton keywords, and determine which overload matches the
+         provided arguments.
+
+         If both conversations and singletons are allowed, the function will raise an exception if both
+         are provided.
+
+         This function must be overridden by child classes IF they need both a conversation and
+         other inputs to be passed in.
+
+         :keyword kwargs: The inputs to convert.
+         :type kwargs: Dict
+         :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
+         :rtype: List
+         """
+
+         # Collect inputs
+         conversation = kwargs.get("conversation", None)
+         singletons = {}
+         if len(self._singleton_inputs) > 0:
+             # Get all possible singleton inputs and check what's provided
+             all_singleton_inputs = self._get_all_singleton_inputs()
+             singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
+         # Check that both conversation and other inputs aren't set
+         if conversation is not None and any(singletons.values()):
+             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
+             raise EvaluationException(
+                 message=msg,
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.INVALID_VALUE,
+                 target=ErrorTarget.CONVERSATION,
+             )
+         # Handle Conversation
+         if conversation is not None:
+             if self._is_multi_modal_conversation(conversation):
+                 return self._derive_multi_modal_conversation_converter()(conversation)
+             return self._derive_conversation_converter()(conversation)
+
+         # Handle Singletons - find matching overload
+         matching_inputs = self._get_matching_overload_inputs(**kwargs)
+         if matching_inputs:
+             # Check if all required inputs for this overload are provided
+             required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+             required_singletons = remove_optional_singletons(self, required_singletons)
+             if all(value is not None for value in required_singletons.values()):
+                 return [singletons]
+
+         # Missing input
+         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
+         raise EvaluationException(
+             message=msg,
+             blame=ErrorBlame.USER_ERROR,
+             category=ErrorCategory.INVALID_VALUE,
+             target=ErrorTarget.CONVERSATION,
+         )
+
+     def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+         if "messages" not in conversation:
+             return False
+         messages = conversation["messages"]
+         if not isinstance(messages, list):
+             return False
+         for message in messages:
+             if "content" in message:
+                 content = message.get("content", "")
+                 if isinstance(content, list):
+                     if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                         return True
+         return False
+
+     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
+         """Aggregate the evaluation results of each conversation turn into a single result.
+
+         Exact implementation might need to vary slightly depending on the results produced.
+         Default behavior is to average all number-based outputs.
+
+         :param per_turn_results: List of evaluation results for each turn in the conversation.
+         :type per_turn_results: List[Dict]
+         :return: A dictionary containing aggregated results, with numeric metrics having their
+             means as top-level values in the dictionary, and all original
+             values (including non-numerics) located under the "evaluation_per_turn" key,
+             with each sub-key being a metric and each sub-value being the list of that metric's
+             per-turn values.
+         :rtype: AggregateResult[T_EvalValue]
+         """
+
+         aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+         evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
+
+         # Go over each turn, and rotate the results into a
+         # metric: List[values] format for the evals_per_turn dictionary.
+         for turn in per_turn_results:
+             for metric, value in turn.items():
+                 if metric not in evaluation_per_turn:
+                     evaluation_per_turn[metric] = []
+                 evaluation_per_turn[metric].append(value)
+
+         # Find and average all numeric values
+         for metric, values in evaluation_per_turn.items():
+             if all(isinstance(value, (int, float)) for value in values):
+                 aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
+         # Slap the per-turn results back in.
+         aggregated["evaluation_per_turn"] = evaluation_per_turn
+         return aggregated
+
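An illustrative sketch (not part of the diff) of the aggregation above, assuming the default mean aggregator:

    per_turn_results = [{"coherence": 1.0}, {"coherence": 3.0}]
    # _aggregate_results(per_turn_results) produces:
    # {"coherence": 2.0, "evaluation_per_turn": {"coherence": [1.0, 3.0]}}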
+     def _parse_tools_from_response(self, response):
+         """Parse the response to extract tool calls and results.
+
+         :param response: The response to parse.
+         :type response: Union[str, List[dict]]
+         :return: List of tool calls extracted from the response.
+         :rtype: List[dict]
+         """
+         tool_calls = []
+         tool_results_map = {}
+
+         # Work on a deep copy to avoid modifying the original object
+         response_copy = copy.deepcopy(response)
+
+         if isinstance(response_copy, list):
+             for message in response_copy:
+                 # Extract tool calls from assistant messages
+                 if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                     for content_item in message.get("content"):
+                         if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                             tool_calls.append(copy.deepcopy(content_item))
+
+                 # Extract tool results from tool messages
+                 elif message.get("role") == "tool" and message.get("tool_call_id"):
+                     tool_call_id = message.get("tool_call_id")
+                     if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                         result_content = message.get("content")[0]
+                         if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                             tool_results_map[tool_call_id] = result_content
+
+         # Attach results to their corresponding calls
+         for tool_call in tool_calls:
+             tool_call_id = tool_call.get("tool_call_id")
+             if tool_call_id in tool_results_map:
+                 tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+         return tool_calls
+
+     def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+         """Extract tool names and parameters from the response.
+
+         :param response: The response to parse.
+         :type response: Union[str, List[dict]]
+         :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+         :rtype: List[Tuple[str, Dict[str, str]]]
+         """
+         tool_calls = self._parse_tools_from_response(response)
+         tool_name_param_pairs = []
+         for tool_call in tool_calls:
+             if not isinstance(tool_call, dict):
+                 raise EvaluationException(
+                     "Tool call must be a dictionary.",
+                     internal_message=str(tool_call),
+                     target=ErrorTarget.EVALUATE,
+                     category=ErrorCategory.UNKNOWN,
+                 )
+             if tool_call.get("type") != "tool_call":
+                 raise EvaluationException(
+                     "Tool call must have 'type' set to 'tool_call'.",
+                     internal_message=str(tool_call),
+                     target=ErrorTarget.EVALUATE,
+                     category=ErrorCategory.INVALID_VALUE,
+                 )
+
+             if "name" not in tool_call:
+                 raise EvaluationException(
+                     "Tool call missing 'name' field.",
+                     internal_message=str(tool_call),
+                     target=ErrorTarget.EVALUATE,
+                     category=ErrorCategory.MISSING_FIELD,
+                 )
+
+             tool_name = str(tool_call["name"]).strip()
+
+             # Extract parameters/arguments
+             parameters = {}
+             if "arguments" in tool_call:
+                 args = tool_call["arguments"]
+                 if isinstance(args, dict):
+                     # Convert all values to strings for consistent comparison
+                     parameters = {str(k): str(v) for k, v in args.items()}
+                 elif isinstance(args, str):
+                     # If arguments is a string, try to parse it as JSON
+                     try:
+                         parsed_args = json.loads(args)
+                         if isinstance(parsed_args, dict):
+                             parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                     except json.JSONDecodeError:
+                         raise EvaluationException(
+                             "Failed to parse tool call arguments as JSON.",
+                             internal_message=str(tool_call),
+                             target=ErrorTarget.EVALUATE,
+                             category=ErrorCategory.INVALID_VALUE,
+                         )
+
+             tool_name_param_pairs.append((tool_name, parameters))
+
+         return tool_name_param_pairs
+
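For illustration only (a message shape inferred from the parsing code above, not an official schema), the two methods above consume agent-style messages such as:

    response = [
        {"role": "assistant", "content": [
            {"type": "tool_call", "tool_call_id": "call_1", "name": "fetch_weather",
             "arguments": {"location": "Seattle"}},
        ]},
        {"role": "tool", "tool_call_id": "call_1", "content": [
            {"type": "tool_result", "tool_result": "Sunny, 20C"},
        ]},
    ]
    # _extract_tool_names_and_params_from_response(response)
    # -> [("fetch_weather", {"location": "Seattle"})]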
+     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
+         """The asynchronous call where real end-to-end evaluation logic is performed.
+
+         :keyword kwargs: The inputs to evaluate.
+         :type kwargs: Dict
+         :return: The evaluation result.
+         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+         """
+         # Convert inputs into list of evaluable inputs.
+         try:
+             eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+         except Exception as e:
+             print(f"Error converting kwargs to eval_input_list: {e}")
+             raise e
+         per_turn_results = []
+         # Evaluate all inputs.
+         for eval_input in eval_input_list:
+             result = await self._do_eval(eval_input)
+             # logic to determine threshold pass/fail
+             try:
+                 for key in list(result.keys()):
+                     if key.endswith("_score") and "rouge" not in key:
+                         score_value = result[key]
+                         base_key = key[:-6]  # Remove "_score" suffix
+                         result_key = f"{base_key}_result"
+                         threshold_key = f"{base_key}_threshold"
+                         threshold_value = (
+                             self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
+                         )
+                         if not isinstance(threshold_value, (int, float)):
+                             raise EvaluationException(
+                                 "Threshold value must be a number.",
+                                 internal_message=str(threshold_value),
+                                 target=ErrorTarget.EVALUATE,
+                                 category=ErrorCategory.INVALID_VALUE,
+                             )
+
+                         result[threshold_key] = threshold_value
+                         if self._higher_is_better:
+                             if float(score_value) >= threshold_value:
+                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                             else:
+                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+                         else:
+                             if float(score_value) <= threshold_value:
+                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
+                             else:
+                                 result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
+             except Exception as e:
+                 print(f"Error calculating binary result: {e}")
+             per_turn_results.append(result)
+         # Return results as-is if only one result was produced.
+         if len(per_turn_results) == 1:
+             return per_turn_results[0]
+         if len(per_turn_results) == 0:
+             return {}  # TODO raise something?
+         # Otherwise, aggregate results.
+         return self._aggregate_results(per_turn_results=per_turn_results)
+
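A sketch of the thresholding step above (assuming the default threshold of 3.0, higher-is-better, and that EVALUATION_PASS_FAIL_MAPPING maps True to "pass" and False to "fail"):

    result = {"coherence_score": 4.0}
    # After the pass/fail annotation in _real_call, result becomes:
    # {"coherence_score": 4.0, "coherence_threshold": 3.0, "coherence_result": "pass"}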
+     # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN ~~~
+
+     @final
+     def _to_async(self) -> "AsyncEvaluatorBase":
+         return self._async_evaluator
+
+     @experimental
+     @final
+     def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+         """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+         multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+         multi-turn conversation into a single top-level result.
+
+         :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+             results of a conversation to produce a single result.
+         :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+         """
+         self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+     @experimental
+     @final
+     def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+         """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+         of an evaluator when it evaluates a conversation with multiple turns and thus ends up with multiple results
+         per evaluation that it needs to coalesce into a single result. Use when built-in aggregators do not
+         suit your needs, but use with caution.
+
+         :param aggregator: The function to use to aggregate per-turn results.
+         :type aggregator: Callable[[List[float]], float]
+         """
+         self._conversation_aggregation_function = aggregator
+
+     @experimental
+     @final
+     def _get_conversation_aggregator_type(self) -> _AggregationType:
+         """Get the current conversation aggregation type used by this evaluator. This refers to the
+         method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+         is inputted into an evaluator that evaluates each turn individually). The individual inputs
+         are combined by the function implied here to produce a single overall result.
+
+         :return: The conversation aggregation type.
+         :rtype: ~azure.ai.evaluation._AggregationType
+         """
+         return GetAggregatorType(self._conversation_aggregation_function)
+
+
+ class AsyncEvaluatorBase:
+     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use of passing functions
+     to ensure that no one ever needs to extend or otherwise modify this class directly.
+     """
+
+     def __init__(self, real_call):  # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT YOU ABOUT META GENERATION
+         self._real_call = real_call
+
+     # Don't look at my shame. Nothing to see here....
+     # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
+     # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
+     # are just not passed into this function instead of ending up in kwargs.
+     # Since we want this to be relatively call-agnostic, we just account for every input that any children
+     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
+     async def __call__(
+         self,
+         *,
+         query=None,
+         response=None,
+         context=None,
+         conversation=None,
+         ground_truth=None,
+         tool_calls=None,
+         tool_definitions=None,
+         messages=None,
+         retrieval_ground_truth=None,
+         retrieved_documents=None,
+         **kwargs,
+     ):
+         if conversation is not None:
+             kwargs["conversation"] = conversation
+         if query is not None:
+             kwargs["query"] = query
+         if response is not None:
+             kwargs["response"] = response
+         if tool_definitions is not None:
+             kwargs["tool_definitions"] = tool_definitions
+         if context is not None:
+             kwargs["context"] = context
+         if ground_truth is not None:
+             kwargs["ground_truth"] = ground_truth
+         if tool_calls is not None:
+             kwargs["tool_calls"] = tool_calls
+         if messages is not None:
+             kwargs["messages"] = messages
+         if retrieval_ground_truth is not None:
+             kwargs["retrieval_ground_truth"] = retrieval_ground_truth
+         if retrieved_documents is not None:
+             kwargs["retrieved_documents"] = retrieved_documents
+
+         return await self._real_call(**kwargs)
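
To see how the pieces fit together, here is a minimal hypothetical child evaluator (ExactMatchEvaluator is illustrative, not part of the package); per the base class docstring, the __call__ override just forwards to super().__call__:

    from typing import Dict
    from typing_extensions import overload

    class ExactMatchEvaluator(EvaluatorBase[float]):
        """Toy evaluator: scores 1.0 if response equals ground_truth, else 0.0."""

        def __init__(self):
            super().__init__(threshold=1.0)

        async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
            score = 1.0 if eval_input["response"] == eval_input["ground_truth"] else 0.0
            return {"exact_match_score": score}

        @overload
        def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]: ...
        @overload
        def __call__(self, *, conversation: dict) -> Dict[str, float]: ...

        def __call__(self, *args, **kwargs):
            # Reshape the header for documentation; behavior stays in the base class.
            return super().__call__(*args, **kwargs)

    # ExactMatchEvaluator()(response="4", ground_truth="4")
    # -> {"exact_match_score": 1.0, "exact_match_threshold": 1.0, "exact_match_result": "pass"}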