azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of azure-ai-evaluation has been flagged as potentially problematic.
Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
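
Taken together, the list shows where the package grew between 1.0.0b2 and 1.13.3: the Azure OpenAI grader wrappers under _aoai/, the shared evaluator bases under _evaluators/_common/, the agent-quality evaluators (_intent_resolution/, _task_adherence/, _tool_call_accuracy/, and friends), the red_team module, and a vendored legacy batch engine plus promptflow adapters. As a minimal sketch of the package's top-level entry point (the evaluate function from _evaluate/_evaluate.py, exported from the package root) with a hypothetical data file and column layout:

from azure.ai.evaluation import evaluate, F1ScoreEvaluator

# "answers.jsonl" is a hypothetical file whose rows carry "response" and
# "ground_truth" columns, the inputs F1ScoreEvaluator expects.
result = evaluate(
    data="answers.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
)
print(result["metrics"])  # aggregated metrics across all rows

The hunks below reproduce the new common evaluator base classes and the aggregator helpers in full, followed by the trimmed content-safety __init__.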
azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py
@@ -0,0 +1,63 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from concurrent.futures import as_completed
+ from typing import TypeVar, Dict, List
+
+ from azure.ai.evaluation._legacy._adapters.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+ from typing_extensions import override
+
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
+
+ T = TypeVar("T")
+
+
+ class MultiEvaluatorBase(EvaluatorBase[T]):
+     """
+     Base class for evaluators that contain and run multiple other evaluators to produce a
+     suite of metrics.
+
+     Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval.
+
+     :param evaluators: The list of evaluators to run when this evaluator is called.
+     :type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase]
+     :param kwargs: Additional arguments to pass to the evaluator.
+     :type kwargs: Any
+     :return: An evaluator that runs multiple other evaluators and combines their results.
+     """
+
+     def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
+         self._threshold = kwargs.pop("threshold", 3)
+         self._higher_is_better = kwargs.pop("_higher_is_better", False)
+         super().__init__(threshold=self._threshold, _higher_is_better=self._higher_is_better)
+         self._parallel = kwargs.pop("_parallel", True)
+         self._evaluators = evaluators
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+         """Run each evaluator, possibly in parallel, and combine the results into
+         a single large dictionary containing each evaluation. Inputs are passed
+         directly to each evaluator without additional processing.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         results: Dict[str, T] = {}
+         if self._parallel:
+             with ThreadPoolExecutor() as executor:
+                 # pylint: disable=no-value-for-parameter
+                 futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators}
+
+                 for future in as_completed(futures):
+                     results.update(future.result())
+         else:
+             for evaluator in self._evaluators:
+                 result = evaluator(**eval_input)
+                 # Ignore is to avoid mypy getting upset over the amount of duck-typing
+                 # that's going on to shove evaluators around like this.
+                 results.update(result)  # type: ignore[arg-type]
+
+         return results
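
The heart of this new file is the fan-out in _do_eval: every child evaluator is a callable that returns a dict of metrics, all are invoked with the same keyword inputs, and the dicts are merged. A self-contained sketch of that pattern with hypothetical toy evaluators, using the stdlib ThreadPoolExecutor in place of the context-propagating variant imported above:

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Dict, List

def length_eval(*, response: str) -> Dict[str, float]:
    return {"length": float(len(response))}

def word_count_eval(*, response: str) -> Dict[str, float]:
    return {"word_count": float(len(response.split()))}

def run_all(evaluators: List[Callable[..., Dict[str, float]]], **eval_input) -> Dict[str, float]:
    results: Dict[str, float] = {}
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(evaluator, **eval_input) for evaluator in evaluators]
        for future in as_completed(futures):
            results.update(future.result())  # later results win on key collisions
    return results

print(run_all([length_eval, word_count_eval], response="The quick brown fox"))
# e.g. {'length': 19.0, 'word_count': 4.0}

Merge order follows completion order, which is why well-behaved child evaluators must emit disjoint metric keys.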
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -0,0 +1,345 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import math
+ import re
+ import os
+ from itertools import chain
+ from typing import Dict, Optional, TypeVar, Union, List
+
+ if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+     from promptflow.core._flow import AsyncPrompty
+ else:
+     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
+ from typing_extensions import override
+
+ from azure.core.credentials import TokenCredential
+ from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+ from . import EvaluatorBase
+
+ try:
+     from ..._user_agent import UserAgentSingleton
+ except ImportError:
+
+     class UserAgentSingleton:
+         @property
+         def value(self) -> str:
+             return "None"
+
+
+ T = TypeVar("T")
+
+
+ class PromptyEvaluatorBase(EvaluatorBase[T]):
+     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
+     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
+     linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
+     per-turn results are stored in a list under the key "evaluation_per_turn").
+
+     :param result_key: The key to use for the result of the evaluation. Single turn evaluations will return
+         a dictionary in the format {result_key: float}.
+     :type result_key: str
+     :param prompty_file: The path to the prompty file to use for evaluation.
+     :type prompty_file: str
+     :param model_config: The model configuration to use for evaluation.
+     :type model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
+     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
+         Useful since some evaluators of this format are response-only.
+     :type ignore_queries: bool
+     :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in the
+         prompty file based on reasoning models. Defaults to False.
+     :type is_reasoning_model: bool
+     """
+
+     _LLM_CALL_TIMEOUT = 600
+     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+     def __init__(
+         self,
+         *,
+         result_key: str,
+         prompty_file: str,
+         model_config: dict,
+         eval_last_turn: bool = False,
+         threshold: int = 3,
+         credential: Optional[TokenCredential] = None,
+         _higher_is_better: bool = False,
+         **kwargs,
+     ) -> None:
+         self._result_key = result_key
+         self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
+         self._prompty_file = prompty_file
+         self._threshold = threshold
+         self._higher_is_better = _higher_is_better
+         super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better)
+
+         subclass_name = self.__class__.__name__
+         user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})"
+         prompty_model_config = construct_prompty_model_config(
+             validate_model_config(model_config),
+             self._DEFAULT_OPEN_API_VERSION,
+             user_agent,
+         )
+
+         self._flow = AsyncPrompty.load(
+             source=self._prompty_file,
+             model=prompty_model_config,
+             token_credential=credential,
+             is_reasoning_model=self._is_reasoning_model,
+         )
+
+     # __call__ not overridden here because child classes have such varied signatures that there's no point
+     # defining a default here.
+     def _get_binary_result(self, score: float) -> str:
+         """Get the binary result based on the score.
+
+         :param score: The score to evaluate.
+         :type score: float
+         :return: The binary result.
+         :rtype: str
+         """
+         if math.isnan(score):
+             return "unknown"
+         if self._higher_is_better:
+             if score >= self._threshold:
+                 return EVALUATION_PASS_FAIL_MAPPING[True]
+             else:
+                 return EVALUATION_PASS_FAIL_MAPPING[False]
+         else:
+             if score <= self._threshold:
+                 return EVALUATION_PASS_FAIL_MAPPING[True]
+             else:
+                 return EVALUATION_PASS_FAIL_MAPPING[False]
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+         """Do an evaluation.
+
+         :param eval_input: The input to the evaluator. Expected to contain
+             whatever inputs are needed for the _flow method, including context
+             and other fields depending on the child class.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         if "query" not in eval_input and "response" not in eval_input:
+             raise EvaluationException(
+                 message="Only text conversation inputs are supported.",
+                 internal_message="Only text conversation inputs are supported.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.INVALID_VALUE,
+                 target=ErrorTarget.CONVERSATION,
+             )
+         # Call the prompty flow to get the evaluation result.
+         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+
+         score = math.nan
+         if prompty_output_dict:
+             llm_output = prompty_output_dict.get("llm_output", "")
+             input_token_count = prompty_output_dict.get("input_token_count", 0)
+             output_token_count = prompty_output_dict.get("output_token_count", 0)
+             total_token_count = prompty_output_dict.get("total_token_count", 0)
+             finish_reason = prompty_output_dict.get("finish_reason", "")
+             model_id = prompty_output_dict.get("model_id", "")
+             sample_input = prompty_output_dict.get("sample_input", "")
+             sample_output = prompty_output_dict.get("sample_output", "")
+             # Parse out score and reason from evaluators known to possess them.
+             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+                 score, reason = parse_quality_evaluator_reason_score(llm_output)
+                 binary_result = self._get_binary_result(score)
+                 return {
+                     self._result_key: float(score),
+                     f"gpt_{self._result_key}": float(score),
+                     f"{self._result_key}_reason": reason,
+                     f"{self._result_key}_result": binary_result,
+                     f"{self._result_key}_threshold": self._threshold,
+                     f"{self._result_key}_prompt_tokens": input_token_count,
+                     f"{self._result_key}_completion_tokens": output_token_count,
+                     f"{self._result_key}_total_tokens": total_token_count,
+                     f"{self._result_key}_finish_reason": finish_reason,
+                     f"{self._result_key}_model": model_id,
+                     f"{self._result_key}_sample_input": sample_input,
+                     f"{self._result_key}_sample_output": sample_output,
+                 }
+             match = re.search(r"\d", llm_output)
+             if match:
+                 score = float(match.group())
+                 binary_result = self._get_binary_result(score)
+                 return {
+                     self._result_key: float(score),
+                     f"gpt_{self._result_key}": float(score),
+                     f"{self._result_key}_result": binary_result,
+                     f"{self._result_key}_threshold": self._threshold,
+                     f"{self._result_key}_prompt_tokens": input_token_count,
+                     f"{self._result_key}_completion_tokens": output_token_count,
+                     f"{self._result_key}_total_tokens": total_token_count,
+                     f"{self._result_key}_finish_reason": finish_reason,
+                     f"{self._result_key}_model": model_id,
+                     f"{self._result_key}_sample_input": sample_input,
+                     f"{self._result_key}_sample_output": sample_output,
+                 }
+
+         binary_result = self._get_binary_result(score)
+         return {
+             self._result_key: float(score),
+             f"gpt_{self._result_key}": float(score),
+             f"{self._result_key}_result": binary_result,
+             f"{self._result_key}_threshold": self._threshold,
+         }
+
+     @staticmethod
+     def _get_built_in_tool_definition(tool_name: str):
+         """Get the definition for the built-in tool."""
+         try:
+             from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+             if tool_name in _BUILT_IN_DESCRIPTIONS:
+                 return {
+                     "type": tool_name,
+                     "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                     "name": tool_name,
+                     "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                 }
+         except ImportError:
+             pass
+         return None
+
+     def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+         """Extract tool definitions needed for the given built-in tool calls."""
+         needed_definitions = []
+         for tool_call in tool_calls:
+             if isinstance(tool_call, dict):
+                 tool_type = tool_call.get("type")
+
+                 # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                 if tool_type == "tool_call":
+                     tool_name = tool_call.get("name")
+                     if tool_name:
+                         definition = self._get_built_in_tool_definition(tool_name)
+                         if definition and definition not in needed_definitions:
+                             needed_definitions.append(definition)
+
+         return needed_definitions
+
+     def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+         """Extract just the tool names from tool calls, removing parameters."""
+         tool_names = []
+         for tool_call in tool_calls:
+             if isinstance(tool_call, dict):
+                 tool_type = tool_call.get("type")
+                 if tool_type == "tool_call":
+                     tool_name = tool_call.get("name")
+                     if tool_name:
+                         tool_names.append(tool_name)
+                 elif tool_call.get("function", {}).get("name"):
+                     # Handle function call format
+                     tool_names.append(tool_call["function"]["name"])
+                 elif tool_call.get("name"):
+                     # Handle direct name format
+                     tool_names.append(tool_call["name"])
+         return tool_names
+
+     def _extract_needed_tool_definitions(
+         self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+     ) -> List[Dict]:
+         """Extract the tool definitions that are needed for the provided tool calls.
+
+         :param tool_calls: The tool calls that need definitions
+         :type tool_calls: List[Dict]
+         :param tool_definitions: User-provided tool definitions
+         :type tool_definitions: List[Dict]
+         :param error_target: The evaluator-specific error target for exceptions
+         :type error_target: ErrorTarget
+         :return: List of needed tool definitions
+         :rtype: List[Dict]
+         :raises EvaluationException: If validation fails
+         """
+         needed_tool_definitions = []
+
+         # Add all user-provided tool definitions
+         needed_tool_definitions.extend(tool_definitions)
+
+         # Add the needed built-in tool definitions (if they are called)
+         built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+         needed_tool_definitions.extend(built_in_definitions)
+
+         # OpenAPI tool is a collection of functions, so we need to expand it
+         tool_definitions_expanded = list(
+             chain.from_iterable(
+                 tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                 for tool in needed_tool_definitions
+             )
+         )
+
+         # Validate that all tool calls have corresponding definitions
+         for tool_call in tool_calls:
+             if isinstance(tool_call, dict):
+                 tool_type = tool_call.get("type")
+
+                 if tool_type == "tool_call":
+                     tool_name = tool_call.get("name")
+                     if tool_name and self._get_built_in_tool_definition(tool_name):
+                         # This is a built-in tool from converter, already handled above
+                         continue
+                     elif tool_name:
+                         # This is a regular function tool from converter
+                         tool_definition_exists = any(
+                             tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                             for tool in tool_definitions_expanded
+                         )
+                         if not tool_definition_exists:
+                             raise EvaluationException(
+                                 message=f"Tool definition for {tool_name} not found",
+                                 blame=ErrorBlame.USER_ERROR,
+                                 category=ErrorCategory.INVALID_VALUE,
+                                 target=error_target,
+                             )
+                     else:
+                         raise EvaluationException(
+                             message=f"Tool call missing name: {tool_call}",
+                             blame=ErrorBlame.USER_ERROR,
+                             category=ErrorCategory.INVALID_VALUE,
+                             target=error_target,
+                         )
+                 else:
+                     # Unsupported tool format - only converter format is supported
+                     raise EvaluationException(
+                         message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                         blame=ErrorBlame.USER_ERROR,
+                         category=ErrorCategory.INVALID_VALUE,
+                         target=error_target,
+                     )
+             else:
+                 # Tool call is not a dictionary
+                 raise EvaluationException(
+                     message=f"Tool call is not a dictionary: {tool_call}",
+                     blame=ErrorBlame.USER_ERROR,
+                     category=ErrorCategory.INVALID_VALUE,
+                     target=error_target,
+                 )
+
+         return needed_tool_definitions
+
+     def _not_applicable_result(
+         self, error_message: str, threshold: Union[int, float]
+     ) -> Dict[str, Union[str, float, Dict]]:
+         """Return a result indicating that the evaluation is not applicable.
+
+         :param error_message: The error message explaining why evaluation is not applicable.
+         :type error_message: str
+         :param threshold: The threshold value for the evaluator.
+         :type threshold: Union[int, float]
+         :return: A dictionary containing the result of the evaluation.
+         :rtype: Dict[str, Union[str, float, Dict]]
+         """
+         # If no tool calls were made or tool call type is not supported, return not applicable result
+         return {
+             self._result_key: self._NOT_APPLICABLE_RESULT,
+             f"{self._result_key}_result": "pass",
+             f"{self._result_key}_threshold": threshold,
+             f"{self._result_key}_reason": error_message,
+             f"{self._result_key}_details": {},
+         }
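
The _get_binary_result helper above encodes the pass/fail rule shared by the prompty evaluators: NaN maps to "unknown", and the comparison direction flips with _higher_is_better (quality scores pass at or above the threshold, severity-style scores pass at or below it). A minimal standalone sketch, with a plain dict standing in for EVALUATION_PASS_FAIL_MAPPING:

import math

PASS_FAIL = {True: "pass", False: "fail"}  # stand-in for EVALUATION_PASS_FAIL_MAPPING

def binary_result(score: float, threshold: float, higher_is_better: bool) -> str:
    if math.isnan(score):
        return "unknown"  # no parsable score was extracted from the LLM output
    if higher_is_better:
        return PASS_FAIL[score >= threshold]
    return PASS_FAIL[score <= threshold]

assert binary_result(4.0, 3.0, higher_is_better=True) == "pass"   # quality 4 clears threshold 3
assert binary_result(4.0, 3.0, higher_is_better=False) == "fail"  # severity 4 exceeds threshold 3
assert binary_result(float("nan"), 3.0, higher_is_better=True) == "unknown"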
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -0,0 +1,198 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from typing import Dict, TypeVar, Union, Optional
+
+ from typing_extensions import override
+
+ from azure.ai.evaluation._common.constants import (
+     EvaluationMetrics,
+     _InternalEvaluationMetrics,
+     Tasks,
+     _InternalAnnotationTasks,
+ )
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
+ from azure.ai.evaluation._exceptions import EvaluationException
+ from azure.ai.evaluation._common.utils import validate_conversation
+ from azure.ai.evaluation._constants import _AggregationType
+ from azure.core.credentials import TokenCredential
+
+ from . import EvaluatorBase
+
+ T = TypeVar("T")
+
+
+ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
+     """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
+     This includes content safety evaluators, protected material evaluators, and others. These evaluators
+     are all assumed to be of the "query and response or conversation" input variety.
+
+     :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+         to specify which evaluation to perform.
+     :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+         aggregation will be performed. If False, all turns will be evaluated and the numeric results will be
+         aggregated. Per-turn results are still available in the output via the "evaluation_per_turn" key
+         when this occurs. Default is False, resulting in full conversation evaluation and aggregation.
+     :type eval_last_turn: bool
+     :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a
+         conversation to produce a single result. Default is ~azure.ai.evaluation._AggregationType.MEAN.
+     :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+     :param threshold: The threshold for the evaluation. Default is 3.
+     :type threshold: Optional[int]
+     :param _higher_is_better: If True, higher scores are better. Default is True.
+     :type _higher_is_better: Optional[bool]
+     :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+         query-response pairs. If False, only the response will be evaluated. Default is False.
+         Can be passed as a keyword argument.
+     :type evaluate_query: bool
+     """
+
+     @override
+     def __init__(
+         self,
+         eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
+         azure_ai_project: Union[dict, str],
+         credential: TokenCredential,
+         eval_last_turn: bool = False,
+         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+         threshold: int = 3,
+         _higher_is_better: Optional[bool] = False,
+         **kwargs,
+     ):
+         super().__init__(
+             eval_last_turn=eval_last_turn,
+             conversation_aggregation_type=conversation_aggregation_type,
+             threshold=threshold,
+             _higher_is_better=_higher_is_better,
+         )
+         self._eval_metric = eval_metric
+         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+         self._credential = credential
+         self._threshold = threshold
+
+         # Handle evaluate_query parameter from kwargs
+         self._evaluate_query = kwargs.get("evaluate_query", False)
+         self._higher_is_better = _higher_is_better
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """Evaluate either a query and response or a conversation. Must supply either a query AND response,
+         or a conversation, but not both.
+
+         :keyword query: The query to evaluate.
+         :paramtype query: Optional[str]
+         :keyword response: The response to evaluate.
+         :paramtype response: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+         """Perform the evaluation using the Azure AI RAI service.
+         The exact evaluation performed is determined by the evaluation metric supplied
+         by the child class initializer.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         if "response" in eval_input:
+             return await self._evaluate_query_response(eval_input)
+
+         conversation = eval_input.get("conversation", None)
+         return await self._evaluate_conversation(conversation)
+
+     async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+         """
+         Evaluates content according to this evaluator's metric.
+
+         :param conversation: The conversation to evaluate, containing a list of messages
+             (each with "role" and "content" keys) under the "messages" key.
+         :type conversation: ~azure.ai.evaluation.Conversation
+         :return: The evaluation score computation based on the Content Safety metric (self.metric).
+         :rtype: Dict[str, Union[float, str]]
+         """
+         # validate inputs
+         validate_conversation(conversation)
+         messages = conversation["messages"]
+         # Run score computation based on supplied metric.
+         result = await evaluate_with_rai_service_multimodal(
+             messages=messages,
+             metric_name=self._eval_metric,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+         )
+         return result
+
+     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
+         query = eval_input.get("query", None)
+         response = eval_input.get("response", None)
+         if response is None:
+             raise EvaluationException(
+                 message="Not implemented",
+                 internal_message=(
+                     "Reached query/response evaluation without supplying response."
+                     + " This should have failed earlier."
+                 ),
+             )
+         input_data = {"response": str(response)}
+
+         if query is not None and self._evaluate_query:
+             input_data["query"] = str(query)
+
+         if "context" in self._get_all_singleton_inputs():
+             context = eval_input.get("context", None)
+             if context is None:
+                 raise EvaluationException(
+                     message="Not implemented",
+                     internal_message=(
+                         "Attempted context-based evaluation without supplying context."
+                         + " This should have failed earlier."
+                     ),
+                 )
+             input_data["context"] = context
+
+         return await evaluate_with_rai_service(  # type: ignore
+             metric_name=self._eval_metric,
+             data=input_data,
+             project_scope=self._azure_ai_project,
+             credential=self._credential,
+             annotation_task=self._get_task(),
+             evaluator_name=self.__class__.__name__,
+         )
+
+     def _get_task(self):
+         """Get the annotation task for the current evaluation metric.
+         The annotation task is used by the RAI service script to determine the message format
+         of the API call, and how the output is processed, among other things.
+
+         :return: The annotation task for the evaluator's self._eval_metric value.
+         :rtype: ~azure.ai.evaluation._common.constants.Tasks
+         """
+         if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+             return Tasks.GROUNDEDNESS
+         if self._eval_metric == EvaluationMetrics.XPIA:
+             return Tasks.XPIA
+         if self._eval_metric == _InternalEvaluationMetrics.ECI:
+             return _InternalAnnotationTasks.ECI
+         if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+             return Tasks.PROTECTED_MATERIAL
+         if self._eval_metric == EvaluationMetrics.CODE_VULNERABILITY:
+             return Tasks.CODE_VULNERABILITY
+         if self._eval_metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
+             return Tasks.UNGROUNDED_ATTRIBUTES
+         return Tasks.CONTENT_HARM
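
Concrete RAI-service evaluators in this release (the content safety family, code vulnerability, ungrounded attributes) layer on this base mostly by pinning eval_metric and forwarding the project and credential. A schematic sketch, not the shipped _violence.py; it assumes EvaluationMetrics.VIOLENCE exists in the constants module and that the base class is importable from the path shown in the file list:

from typing import Union

from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common._base_rai_svc_eval import RaiServiceEvaluatorBase
from azure.core.credentials import TokenCredential


class MyViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
    """Hypothetical subclass; the shipped ViolenceEvaluator may differ in detail."""

    def __init__(self, credential: TokenCredential, azure_ai_project: Union[dict, str], **kwargs):
        super().__init__(
            eval_metric=EvaluationMetrics.VIOLENCE,  # falls through to Tasks.CONTENT_HARM in _get_task
            azure_ai_project=azure_ai_project,
            credential=credential,
            **kwargs,
        )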
azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py
@@ -0,0 +1,49 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from typing import Callable, List
+ from azure.ai.evaluation._common.math import list_mean
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._constants import _AggregationType
+
+
+ def GetAggregator(aggregation_type: _AggregationType) -> Callable[[List[float]], float]:
+     if aggregation_type == _AggregationType.SUM:
+         return sum
+     if aggregation_type == _AggregationType.MEAN:
+         return list_mean
+     if aggregation_type == _AggregationType.MAX:
+         return max
+     if aggregation_type == _AggregationType.MIN:
+         return min
+     if aggregation_type == _AggregationType.CUSTOM:
+         msg = (
+             "Cannot 'get' aggregator function associated with custom aggregation enum."
+             + " This enum value should only be outputted as an indicator of an injected"
+             + " aggregation function, not inputted directly"
+         )
+         raise EvaluationException(
+             message=msg,
+             blame=ErrorBlame.UNKNOWN,
+             category=ErrorCategory.INVALID_VALUE,
+             target=ErrorTarget.EVALUATE,
+         )
+     raise EvaluationException(
+         message=f"Unaccounted for aggregation type: {aggregation_type}",
+         blame=ErrorBlame.UNKNOWN,
+         category=ErrorCategory.INVALID_VALUE,
+         target=ErrorTarget.EVALUATE,
+     )
+
+
+ def GetAggregatorType(aggregation_function: Callable) -> _AggregationType:
+     if aggregation_function == sum:  # pylint: disable=comparison-with-callable
+         return _AggregationType.SUM
+     if aggregation_function == list_mean:  # pylint: disable=comparison-with-callable
+         return _AggregationType.MEAN
+     if aggregation_function == max:  # pylint: disable=comparison-with-callable
+         return _AggregationType.MAX
+     if aggregation_function == min:  # pylint: disable=comparison-with-callable
+         return _AggregationType.MIN
+     return _AggregationType.CUSTOM
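
GetAggregator and GetAggregatorType are meant to be inverses over the four built-in aggregations, with CUSTOM as the marker for injected callables that have no enum equivalent. A quick round-trip sketch, assuming these private modules are importable exactly as laid out in the file list:

from azure.ai.evaluation._constants import _AggregationType
from azure.ai.evaluation._evaluators._common._conversation_aggregators import (
    GetAggregator,
    GetAggregatorType,
)

agg = GetAggregator(_AggregationType.MEAN)  # returns list_mean
print(agg([1.0, 2.0, 4.0]))                 # 2.333...
assert GetAggregatorType(agg) == _AggregationType.MEAN

# Any unrecognized callable round-trips to CUSTOM rather than raising.
assert GetAggregatorType(lambda scores: scores[0]) == _AggregationType.CUSTOM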
azure/ai/evaluation/_evaluators/_content_safety/__init__.py
@@ -3,8 +3,6 @@
  # ---------------------------------------------------------
 
  from ._content_safety import ContentSafetyEvaluator
- from ._content_safety_base import ContentSafetyEvaluatorBase
- from ._content_safety_chat import ContentSafetyChatEvaluator
  from ._hate_unfairness import HateUnfairnessEvaluator
  from ._self_harm import SelfHarmEvaluator
  from ._sexual import SexualEvaluator
@@ -16,6 +14,4 @@ __all__ = [
      "SelfHarmEvaluator",
      "HateUnfairnessEvaluator",
      "ContentSafetyEvaluator",
-     "ContentSafetyChatEvaluator",
-     "ContentSafetyEvaluatorBase",
  ]