azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of azure-ai-evaluation has been flagged as a potentially problematic release.
Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
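
The hunks below reproduce some of the modules removed in this range, most notably the old ChatEvaluator and its retrieval prompt flow. As a rough orientation for readers migrating off ChatEvaluator, the sketch that follows reassembles its per-turn scoring loop from the per-metric evaluators the deleted code constructs internally. It is illustrative only: the evaluator names and the query/response/context keyword arguments are copied from the deleted _chat.py shown below, the top-level azure.ai.evaluation import path is assumed, signatures may have shifted by 1.13.3, and the endpoint, deployment, and key values are placeholders.

import json

from azure.ai.evaluation import (
    CoherenceEvaluator,
    FluencyEvaluator,
    GroundednessEvaluator,
    RelevanceEvaluator,
)

# Placeholder Azure OpenAI model configuration; substitute real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

# Same shape as the conversation in the deleted ChatEvaluator docstring.
conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {
        "role": "assistant",
        "content": "2 + 2 = 4",
        "context": {"citations": [{"id": "math_doc.md", "content": "2 + 2 = 4"}]},
    },
]

evaluators = [
    CoherenceEvaluator(model_config),
    FluencyEvaluator(model_config),
    GroundednessEvaluator(model_config),
    RelevanceEvaluator(model_config),
]

# Pair each user turn with the assistant turn that follows it and score the pair,
# mirroring the query/response/context extraction in the deleted ChatEvaluator.
per_turn_results = []
for user_turn, assistant_turn in zip(conversation[0::2], conversation[1::2]):
    context = json.dumps(assistant_turn.get("context", {}).get("citations", []))
    turn_result = {}
    for evaluator in evaluators:
        turn_result.update(
            evaluator(
                query=user_turn["content"],
                response=assistant_turn["content"],
                context=context,
            )
        )
    per_turn_results.append(turn_result)

print(per_turn_results)
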
@@ -1,357 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
- import json
- import logging
- from concurrent.futures import as_completed
- from typing import Dict, List
-
- import numpy as np
- from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
- from .._coherence import CoherenceEvaluator
- from .._fluency import FluencyEvaluator
- from .._groundedness import GroundednessEvaluator
- from .._relevance import RelevanceEvaluator
- from .retrieval import RetrievalChatEvaluator
-
- logger = logging.getLogger(__name__)
-
-
- class ChatEvaluator:
-     """
-     Initialize a chat evaluator configured for a specific Azure OpenAI model.
-
-     :param model_config: Configuration for the Azure OpenAI model.
-     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-         ~azure.ai.evaluation.OpenAIModelConfiguration]
-     :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
-         focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False
-     :type eval_last_turn: bool
-     :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
-         Default is True.
-     :type parallel: bool
-     :return: A function that evaluates and generates metrics for "chat" scenario.
-     :rtype: Callable
-
-     **Usage**
-
-     .. code-block:: python
-
-         chat_eval = ChatEvaluator(model_config)
-         conversation = [
-             {"role": "user", "content": "What is the value of 2 + 2?"},
-             {"role": "assistant", "content": "2 + 2 = 4", "context": {
-                 "citations": [
-                     {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
-                 ]
-             }
-             }
-         ]
-         result = chat_eval(conversation=conversation)
-
-     **Output format**
-
-     .. code-block:: python
-
-         {
-             "evaluation_per_turn": {
-                 "gpt_retrieval": [1.0, 2.0],
-                 "gpt_groundedness": [5.0, 2.0],
-                 "gpt_relevance": [3.0, 5.0],
-                 "gpt_coherence": [1.0, 2.0],
-                 "gpt_fluency": [3.0, 5.0]
-             }
-             "gpt_retrieval": 1.5,
-             "gpt_groundedness": 3.5,
-             "gpt_relevance": 4.0,
-             "gpt_coherence": 1.5,
-             "gpt_fluency": 4.0
-         }
-     """
-
-     def __init__(
-         self,
-         model_config: dict,
-         eval_last_turn: bool = False,
-         parallel: bool = True,
-     ):
-         self._eval_last_turn = eval_last_turn
-         self._parallel = parallel
-
-         # TODO: Need a built-in evaluator for retrieval. It needs to be added to `self._rag_evaluators` collection
-         self._rag_evaluators = [
-             GroundednessEvaluator(model_config),
-             RelevanceEvaluator(model_config),
-         ]
-         self._non_rag_evaluators = [
-             CoherenceEvaluator(model_config),
-             FluencyEvaluator(model_config),
-         ]
-         # TODO: Temporary workaround to close the gap of missing retrieval score
-         # https://msdata.visualstudio.com/Vienna/_workitems/edit/3186644
-         # For long term, we need to add a built-in evaluator for retrieval after prompt is generalized for QA and Chat
-         self._retrieval_chat_evaluator = RetrievalChatEvaluator(model_config)
-
-     def __call__(self, *, conversation, **kwargs):
-         """
-         Evaluates chat scenario.
-
-         :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
-             "context" key is optional for assistant's turn and should have "citations" key with list of citations.
-         :paramtype conversation: List[Dict]
-         :return: The scores for Chat scenario.
-         :rtype: dict
-         """
-         self._validate_conversation(conversation)
-
-         # Extract queries, responses and contexts from conversation
-         queries = []
-         responses = []
-         contexts = []
-
-         if self._eval_last_turn:
-             # Process only the last two turns if _eval_last_turn is True
-             conversation_slice = conversation[-2:] if len(conversation) >= 2 else conversation
-         else:
-             conversation_slice = conversation
-
-         for each_turn in conversation_slice:
-             role = each_turn["role"]
-             if role == "user":
-                 queries.append(each_turn["content"])
-             elif role == "assistant":
-                 responses.append(each_turn["content"])
-                 if "context" in each_turn and "citations" in each_turn["context"]:
-                     citations = json.dumps(each_turn["context"]["citations"])
-                     contexts.append(citations)
-
-         # Select evaluators to be used for evaluation
-         compute_rag_based_metrics = True
-         if len(responses) != len(contexts):
-             safe_message = (
-                 "Skipping rag based metrics as we need citations or "
-                 "retrieved_documents in context key of every assistant's turn"
-             )
-             logger.warning(safe_message)
-             compute_rag_based_metrics = False
-
-         selected_evaluators = []
-         selected_evaluators.extend(self._non_rag_evaluators)
-         if compute_rag_based_metrics:
-             selected_evaluators.extend(self._rag_evaluators)
-
-         # Evaluate each turn
-         per_turn_results = []
-         for turn_num in range(len(queries)):
-             current_turn_result = {}
-
-             if self._parallel:
-                 # Parallel execution
-                 with ThreadPoolExecutor() as executor:
-                     future_to_evaluator = {
-                         executor.submit(
-                             self._evaluate_turn, turn_num, queries, responses, contexts, evaluator
-                         ): evaluator
-                         for evaluator in selected_evaluators
-                     }
-
-                     for future in as_completed(future_to_evaluator):
-                         result = future.result()
-                         current_turn_result.update(result)
-             else:
-                 # Sequential execution
-                 for evaluator in selected_evaluators:
-                     async_evaluator = evaluator._to_async()
-                     result = self._evaluate_turn(turn_num, queries, responses, contexts, async_evaluator)
-                     current_turn_result.update(result)
-
-             per_turn_results.append(current_turn_result)
-
-         # Aggregate results
-         # Final aggregated results for a conversation will look like:
-         # "gpt_groundedness": 2.0, # Mean of all groundedness scores
-         # "evaluation_per_turn": {
-         #     "gpt_groundedness": {
-         #         "score": [1.0, ...],
-         #         "reason": ["reason1", ...],
-         #     },
-         # },
-         # }
-         aggregated = self._aggregate_results(per_turn_results)
-
-         # Run RetrievalChatEvaluator and merge the results
-         if compute_rag_based_metrics:
-             retrieval_score = self._retrieval_chat_evaluator(conversation=conversation_slice)
-             aggregated["gpt_retrieval"] = retrieval_score["gpt_retrieval"]
-             aggregated["evaluation_per_turn"]["gpt_retrieval"] = retrieval_score["evaluation_per_turn"]["gpt_retrieval"]
-             aggregated = dict(sorted(aggregated.items()))
-
-         return aggregated
-
-     def _evaluate_turn(self, turn_num, queries, responses, contexts, evaluator):
-         try:
-             query = queries[turn_num] if turn_num < len(queries) else ""
-             response = responses[turn_num] if turn_num < len(responses) else ""
-             context = contexts[turn_num] if turn_num < len(contexts) else ""
-
-             score = evaluator(query=query, response=response, context=context)
-
-             return score
-         except Exception as e: # pylint: disable=broad-exception-caught
-             logger.warning(
-                 "Evaluator %s failed for turn %s with exception: %s", evaluator.__class__.__name__, turn_num + 1, e
-             )
-             return {}
-
-     def _aggregate_results(self, per_turn_results: List[Dict]):
-         scores = {}
-         reasons = {}
-
-         for turn in per_turn_results:
-             for metric, value in turn.items():
-                 if "reason" in metric:
-                     if metric not in reasons:
-                         reasons[metric] = []
-                     reasons[metric].append(value)
-                 else:
-                     if metric not in scores:
-                         scores[metric] = []
-                     scores[metric].append(value)
-
-         aggregated = {}
-         evaluation_per_turn = {}
-
-         for metric, values in scores.items():
-             aggregated[metric] = np.nanmean(values)
-
-             # Prepare per-turn evaluations
-             evaluation_per_turn[metric] = {"score": values}
-             reason_key = f"{metric}_reason"
-             if reason_key in reasons:
-                 evaluation_per_turn[metric]["reason"] = reasons[reason_key]
-
-         aggregated["evaluation_per_turn"] = evaluation_per_turn
-
-         return aggregated
-
-     def _validate_conversation(self, conversation: List[Dict]):
-         if conversation is None or not isinstance(conversation, list):
-             msg = "conversation must be a list of dictionaries"
-             raise EvaluationException(
-                 message=msg,
-                 internal_message=msg,
-                 target=ErrorTarget.CHAT_EVALUATOR,
-                 category=ErrorCategory.INVALID_VALUE,
-                 blame=ErrorBlame.USER_ERROR,
-             )
-
-         expected_role = "user"
-         for turn_num, turn in enumerate(conversation):
-             one_based_turn_num = turn_num + 1
-
-             if not isinstance(turn, dict):
-                 msg = f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}"
-                 raise EvaluationException(
-                     message=msg,
-                     internal_message=msg,
-                     target=ErrorTarget.CHAT_EVALUATOR,
-                     category=ErrorCategory.INVALID_VALUE,
-                     blame=ErrorBlame.USER_ERROR,
-                 )
-
-             if "role" not in turn or "content" not in turn:
-                 msg = (
-                     "Each turn in 'conversation' must have 'role' and 'content' keys. "
-                     + f"Turn number: {one_based_turn_num}"
-                 )
-                 raise EvaluationException(
-                     message=msg,
-                     internal_message=msg,
-                     target=ErrorTarget.CHAT_EVALUATOR,
-                     category=ErrorCategory.INVALID_VALUE,
-                     blame=ErrorBlame.USER_ERROR,
-                 )
-
-             if turn["role"] != expected_role:
-                 msg = f"Expected role {expected_role} but got {turn['role']}. Turn number: {one_based_turn_num}"
-                 raise EvaluationException(
-                     message=msg,
-                     internal_message=msg,
-                     target=ErrorTarget.CHAT_EVALUATOR,
-                     category=ErrorCategory.INVALID_VALUE,
-                     blame=ErrorBlame.USER_ERROR,
-                 )
-
-             if not isinstance(turn["content"], str):
-                 msg = f"Content in each turn must be a string. Turn number: {one_based_turn_num}"
-                 raise EvaluationException(
-                     message=msg,
-                     internal_message=msg,
-                     target=ErrorTarget.CHAT_EVALUATOR,
-                     category=ErrorCategory.INVALID_VALUE,
-                     blame=ErrorBlame.USER_ERROR,
-                 )
-
-             if turn["role"] == "assistant" and "context" in turn:
-                 if not isinstance(turn["context"], dict):
-                     msg = f"Context in each assistant's turn must be a dictionary. Turn number: {one_based_turn_num}"
-                     raise EvaluationException(
-                         message=msg,
-                         internal_message=msg,
-                         target=ErrorTarget.CHAT_EVALUATOR,
-                         category=ErrorCategory.INVALID_VALUE,
-                         blame=ErrorBlame.USER_ERROR,
-                     )
-
-                 if "citations" not in turn["context"]:
-                     msg = (
-                         f"Context in each assistant's turn must have 'citations' key. Turn number: {one_based_turn_num}"
-                     )
-                     raise EvaluationException(
-                         message=msg,
-                         internal_message=msg,
-                         target=ErrorTarget.CHAT_EVALUATOR,
-                         category=ErrorCategory.MISSING_FIELD,
-                         blame=ErrorBlame.USER_ERROR,
-                     )
-
-                 if not isinstance(turn["context"]["citations"], list):
-                     msg = f"'citations' in context must be a list. Turn number: {one_based_turn_num}"
-                     raise EvaluationException(
-                         message=msg,
-                         internal_message=msg,
-                         target=ErrorTarget.CHAT_EVALUATOR,
-                         category=ErrorCategory.INVALID_VALUE,
-                         blame=ErrorBlame.USER_ERROR,
-                     )
-
-                 for citation_num, citation in enumerate(turn["context"]["citations"]):
-                     if not isinstance(citation, dict):
-                         msg = (
-                             "Each citation in 'citations' must be a dictionary. "
-                             + f"Turn number: {one_based_turn_num}, Citation number: {citation_num + 1}"
-                         )
-                         raise EvaluationException(
-                             message=msg,
-                             internal_message=msg,
-                             target=ErrorTarget.CHAT_EVALUATOR,
-                             category=ErrorCategory.INVALID_VALUE,
-                             blame=ErrorBlame.USER_ERROR,
-                         )
-
-             # Toggle expected role for the next turn
-             expected_role = "user" if expected_role == "assistant" else "assistant"
-
-         # Ensure the conversation ends with an assistant's turn
-         if expected_role != "user":
-             msg = "The conversation must end with an assistant's turn."
-             raise EvaluationException(
-                 message=msg,
-                 internal_message=msg,
-                 target=ErrorTarget.CHAT_EVALUATOR,
-                 category=ErrorCategory.INVALID_VALUE,
-                 blame=ErrorBlame.USER_ERROR,
-             )
@@ -1,157 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
-
- import json
- import logging
- import os
- import re
-
- import numpy as np
- from promptflow._utils.async_utils import async_run_allowing_running_loop
- from promptflow.core import AsyncPrompty
-
- from ...._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
-
- logger = logging.getLogger(__name__)
-
- try:
-     from ...._user_agent import USER_AGENT
- except ImportError:
-     USER_AGENT = None
-
-
- class _AsyncRetrievalChatEvaluator:
-     # Constants must be defined within eval's directory to be save/loadable
-     PROMPTY_FILE = "retrieval.prompty"
-     LLM_CALL_TIMEOUT = 600
-     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
-
-     def __init__(self, model_config: dict):
-         ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
-
-         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
-
-         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-         # https://github.com/encode/httpx/discussions/2959
-         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
-
-         ensure_user_agent_in_aoai_model_config(
-             model_config,
-             prompty_model_config,
-             USER_AGENT,
-         )
-
-         current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
-         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
-
-     async def __call__(self, *, conversation, **kwargs):
-         # Extract queries, responses and contexts from conversation
-         queries = []
-         responses = []
-         contexts = []
-
-         for each_turn in conversation:
-             role = each_turn["role"]
-             if role == "user":
-                 queries.append(each_turn["content"])
-             elif role == "assistant":
-                 responses.append(each_turn["content"])
-                 if "context" in each_turn and "citations" in each_turn["context"]:
-                     citations = json.dumps(each_turn["context"]["citations"])
-                     contexts.append(citations)
-
-         # Evaluate each turn
-         per_turn_scores = []
-         history = []
-         for turn_num, query in enumerate(queries):
-             try:
-                 query = query if turn_num < len(queries) else ""
-                 answer = responses[turn_num] if turn_num < len(responses) else ""
-                 context = contexts[turn_num] if turn_num < len(contexts) else ""
-
-                 history.append({"user": query, "assistant": answer})
-
-                 llm_output = await self._flow(
-                     query=query, history=history, documents=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs
-                 )
-                 score = np.nan
-                 if llm_output:
-                     parsed_score_response = re.findall(r"\d+", llm_output.split("# Result")[-1].strip())
-                     if len(parsed_score_response) > 0:
-                         score = float(parsed_score_response[0].replace("'", "").strip())
-
-                 per_turn_scores.append(score)
-
-             except Exception as e: # pylint: disable=broad-exception-caught
-                 logger.warning(
-                     "Evaluator %s failed for turn %s with exception: %s", self.__class__.__name__, turn_num + 1, e
-                 )
-
-                 per_turn_scores.append(np.nan)
-
-         return {
-             "gpt_retrieval": np.nanmean(per_turn_scores),
-             "evaluation_per_turn": {
-                 "gpt_retrieval": {
-                     "score": per_turn_scores,
-                 }
-             },
-         }
-
-
- class RetrievalChatEvaluator:
-     """
-     Initialize an evaluator configured for a specific Azure OpenAI model.
-
-     :param model_config: Configuration for the Azure OpenAI model.
-     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
-         ~azure.ai.evaluation.OpenAIModelConfiguration]
-     :return: A function that evaluates and generates metrics for "chat" scenario.
-     :rtype: Callable
-     **Usage**
-
-     .. code-block:: python
-
-         chat_eval = RetrievalChatEvaluator(model_config)
-         conversation = [
-             {"role": "user", "content": "What is the value of 2 + 2?"},
-             {"role": "assistant", "content": "2 + 2 = 4", "context": {
-                 "citations": [
-                     {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
-                 ]
-             }
-             }
-         ]
-         result = chat_eval(conversation=conversation)
-
-     **Output format**
-
-     .. code-block:: python
-
-         {
-             "gpt_retrieval": 3.0
-             "evaluation_per_turn": {
-                 "gpt_retrieval": {
-                     "score": [1.0, 2.0, 3.0]
-                 }
-             }
-         }
-     """
-
-     def __init__(self, model_config: dict):
-         self._async_evaluator = _AsyncRetrievalChatEvaluator(model_config)
-
-     def __call__(self, *, conversation, **kwargs):
-         """Evaluates retrieval score chat scenario.
-
-         :keyword conversation: The conversation to be evaluated.
-         :paramtype conversation: List[Dict]
-         :return: The scores for Chat scenario.
-         :rtype: dict
-         """
-         return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs)
-
-     def _to_async(self):
-         return self._async_evaluator
@@ -1,48 +0,0 @@
- ---
- name: Retrieval
- description: Evaluates retrieval score for Chat scenario
- model:
-   api: chat
-   configuration:
-     type: azure_openai
-     azure_deployment: ${env:AZURE_DEPLOYMENT}
-     api_key: ${env:AZURE_OPENAI_API_KEY}
-     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
-   parameters:
-     temperature: 0.0
-     top_p: 1.0
-     presence_penalty: 0
-     frequency_penalty: 0
-     response_format:
-       type: text
-
- inputs:
-   query:
-     type: string
-   history:
-     type: string
-   documents:
-     type: string
-
- ---
- system:
- A chat history between user and bot is shown below
- A list of documents is shown below in json format, and each document has one unique id.
- These listed documents are used as context to answer the given question.
- The task is to score the relevance between the documents and the potential answer to the given question in the range of 1 to 5.
- 1 means none of the documents is relevant to the question at all. 5 means either one of the document or combination of a few documents is ideal for answering the given question.
- Think through step by step:
- - Summarize each given document first
- - Determine the underlying intent of the given question, when the question is ambiguous, refer to the given chat history
- - Measure how suitable each document to the given question, list the document id and the corresponding relevance score.
- - Summarize the overall relevance of given list of documents to the given question after # Overall Reason, note that the answer to the question can solely from single document or a combination of multiple documents.
- - Finally, output "# Result" followed by a score from 1 to 5.
-
- # Question
- {{ query }}
- # Chat History
- {{ history }}
- # Documents
- ===BEGIN RETRIEVED DOCUMENTS===
- {{ documents }}
- ===END RETRIEVED DOCUMENTS===
@@ -1,65 +0,0 @@
- # ---------------------------------------------------------
- # Copyright (c) Microsoft Corporation. All rights reserved.
- # ---------------------------------------------------------
-
- from abc import ABC
-
- from azure.ai.evaluation._common.constants import EvaluationMetrics
- from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
-
- class ContentSafetyEvaluatorBase(ABC):
-     """
-     Initialize a evaluator for a specified Evaluation Metric. Base class that is not
-     meant to be instantiated by users.
-
-
-     :param metric: The metric to be evaluated.
-     :type metric: ~azure.ai.evaluation._evaluators._content_safety.flow.constants.EvaluationMetrics
-     :param azure_ai_project: The scope of the Azure AI project.
-         It contains subscription id, resource group, and project name.
-     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-     :param credential: The credential for connecting to Azure AI project.
-     :type credential: ~azure.core.credentials.TokenCredential
-     """
-
-     def __init__(self, metric: EvaluationMetrics, azure_ai_project: dict, credential=None):
-         self._metric = metric
-         self._azure_ai_project = azure_ai_project
-         self._credential = credential
-
-     async def __call__(self, *, query: str, response: str, **kwargs):
-         """
-         Evaluates content according to this evaluator's metric.
-
-         :keyword query: The query to be evaluated.
-         :paramtype query: str
-         :keyword response: The response to be evaluated.
-         :paramtype response: str
-         :return: The evaluation score computation based on the Content Safety metric (self.metric).
-         :rtype: Any
-         """
-         # Validate inputs
-         # Raises value error if failed, so execution alone signifies success.
-         if not (query and query.strip() and query != "None") or not (
-             response and response.strip() and response != "None"
-         ):
-             msg = "Both 'query' and 'response' must be non-empty strings."
-             raise EvaluationException(
-                 message=msg,
-                 internal_message=msg,
-                 error_category=ErrorCategory.MISSING_FIELD,
-                 error_blame=ErrorBlame.USER_ERROR,
-                 error_target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
-             )
-
-         # Run score computation based on supplied metric.
-         result = await evaluate_with_rai_service(
-             metric_name=self._metric,
-             query=query,
-             response=response,
-             project_scope=self._azure_ai_project,
-             credential=self._credential,
-         )
-         return result