azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py
@@ -0,0 +1,148 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import logging
+ import os
+ from typing import Dict, List, Union
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._evaluators._common._base_prompty_eval import PromptyEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation
+
+ logger = logging.getLogger(__name__)
+
+
+ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """
+     Evaluates retrieval score for a given query and context or a multi-turn conversation, including reasoning.
+
+     The retrieval measure assesses the AI system's performance in retrieving information
+     for additional context (e.g. a RAG scenario).
+
+     Retrieval scores range from 1 to 5, with 1 being the worst and 5 being the best.
+
+     High retrieval scores indicate that the AI system has successfully extracted and ranked
+     the most relevant information at the top, without introducing bias from external knowledge
+     and ignoring factual correctness. Conversely, low retrieval scores suggest that the AI system
+     has failed to surface the most relevant context chunks at the top of the list
+     and/or introduced bias and ignored factual correctness.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+     :param threshold: The threshold for the evaluation. Default is 3.
+     :type threshold: float
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool
+     :return: A function that evaluates and generates metrics for the "chat" scenario.
+     :rtype: Callable
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START retrieval_evaluator]
+             :end-before: [END retrieval_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a RetrievalEvaluator.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START retrieval_evaluator]
+             :end-before: [END retrieval_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_retrieval_evaluator]
+             :end-before: [END threshold_retrieval_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a RetrievalEvaluator.
+
+     .. note::
+
+         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+         To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+     """
+
+     _PROMPTY_FILE = "retrieval.prompty"
+     _RESULT_KEY = "retrieval"
+
+     id = "azureai://built-in/evaluators/retrieval"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config, *, threshold: float = 3, credential=None, **kwargs):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             credential=credential,
+             _higher_is_better=self._higher_is_better,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: str,
+         context: str,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluates retrieval for a given query and context.
+
+         :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
+         :paramtype query: Optional[str]
+         :keyword context: The context to be evaluated. Mutually exclusive with the `conversation` parameter.
+         :paramtype context: Optional[str]
+         :return: The scores for the chat scenario.
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+         """Evaluates retrieval for a multi-turn conversation. If the conversation has more than one turn,
+         the evaluator will aggregate the results of each turn.
+
+         :keyword conversation: The conversation to be evaluated.
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The scores for the chat scenario.
+         :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+         """
+
+     @override
+     def __call__(self, *args, **kwargs):  # pylint: disable=docstring-missing-param
+         """Evaluates the retrieval score for a chat scenario. Accepts either a query and context for a single evaluation,
+         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+         the evaluator will aggregate the results of each turn.
+
+         :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
+         :paramtype query: Optional[str]
+         :keyword context: The context to be evaluated. Mutually exclusive with the `conversation` parameter.
+         :paramtype context: Optional[str]
+         :keyword conversation: The conversation to be evaluated.
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The scores for the chat scenario.
+         :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
+         """
+         return super().__call__(*args, **kwargs)
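For orientation, here is a minimal usage sketch of the new evaluator as exported from `azure.ai.evaluation` (not taken from the diff; the endpoint, deployment, and key values are placeholders you must supply):

```python
from azure.ai.evaluation import RetrievalEvaluator

# Placeholder model configuration; substitute your own Azure OpenAI resource values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

retrieval_eval = RetrievalEvaluator(model_config=model_config, threshold=3)

# Single-turn form: score how well the retrieved context serves the query (1-5).
result = retrieval_eval(
    query="What is the capital of Japan?",
    context="Tokyo is the capital of Japan. It is also the country's most populous city.",
)
print(result)  # expect keys such as "retrieval" plus the legacy "gpt_retrieval"
```

Passing `conversation=` instead of `query`/`context` selects the multi-turn overload, which aggregates the per-turn scores.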
azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty
@@ -0,0 +1,93 @@
+ ---
+ name: Retrieval
+ description: Evaluates retrieval quality score for RAG scenario
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 1600
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   query:
+     type: string
+   context:
+     type: string
+
+ ---
+ system:
+ # Instruction
+ ## Goal
+ ### You are an expert in evaluating the quality of a list of CONTEXT chunks from a query based on provided definition and data. Your goal will involve answering the questions below using the information provided.
+ - **Definition**: You are given a definition of the retrieval quality that is being evaluated to help guide your Score.
+ - **Data**: Your input data include QUERY and CONTEXT.
+ - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+ user:
+ # Definition
+ **Retrieval** refers to measuring how relevant the context chunks are to address a query and how the most relevant context chunks are surfaced at the top of the list. It emphasizes the extraction and ranking of the most relevant information at the top, without introducing bias from external knowledge and ignoring factual correctness. It assesses the relevance and effectiveness of the retrieved context chunks with respect to the query.
+
+ # Ratings
+ ## [Retrieval: 1] (Irrelevant Context, External Knowledge Bias)
+ **Definition:** The retrieved context chunks are not relevant to the query despite any conceptual similarities. There is no overlap between the query and the retrieved information, and no useful chunks appear in the results. They introduce external knowledge that isn't part of the retrieval documents.
+
+ **Examples:**
+ **Query:** what is kuchen?
+ **Context:** ["There's nothing like the taste of a cake you made in your own kitchen. Baking a cake is as simple as measuring ingredients, mixing them in the right order, and remembering to take the cake out of the oven before it burns.", "A steady 325-350 degrees is ideal when it comes to baking pound cake. Position the pan in the middle of the oven, and rotate it once, halfway through the baking time, as it bakes to account for any hot spots. "CHOCOLATE POUND CAKE. Cream butter, sugar ... and floured bundt pan, 10 inch pan or 2 (9x5x3 inch) loaf pans. Bake at ... pans. Bake until cake tester inserted in ... to drizzle down sides. 4. BUTTERMILK LEMON POUND CAKE."", "Pour batter into your pan(s) and place in the oven. Cook for 75 minutes, checking periodically. Some ovens cook unevenly or quickly -- if this describes yours, keep an eye on it. 1 If to be used for fancy ornamented cakes, bake 30 to 35 minutes in a dripping-pan. 2 Insert a skewer or toothpick to see if it's finished.", "As a general rule of thumb you can bake most cakes at 375 degrees Fahrenheit (which is 180 degrees Celsius) and check them after about 30 minutes and expect it to take at least 45 minutes.", "Till a toothpick inserted in the center of the cake comes out clean. Depends on the heat of your oven but start checking at about 45 minutes and when the cake is golden brown. sonnyboy · 8 years ago. Thumbs up.", "1 This results in a pound cake with maximum volume. 2 Be patient. Beat softened butter (and cream cheese or vegetable shortening) at medium speed with an electric mixer until creamy. 3 This can take from 1 to 7 minutes, depending on the power of your mixer."]
+
+ **Query:** What are the main economic impacts of global warming?
+ **Context:** ["Economic theories such as supply and demand explain how prices fluctuate in a free market.", "Global warming is caused by increased carbon dioxide levels, which affect the environment and the atmosphere.", "Political factors also play a role in economic decisions across nations."]
+
+ ## [Retrieval: 2] (Partially Relevant Context, Poor Ranking, External Knowledge Bias)
+ **Definition:** The context chunks are partially relevant to address the query but are mostly irrelevant, and external knowledge or LLM bias starts influencing the context chunks. The most relevant chunks are either missing or placed at the bottom.
+
+ **Examples:**
+ **Query:** what is rappelling
+ **Context:** ["5. Cancel. Rappelling is the process of coming down from a mountain that is usually done with two pieces of rope. Use a natural anchor or a set of bolts to rappel from with help from an experienced rock climber in this free video on rappelling techniques. Part of the Video Series: Rappelling & Rock Climbing.", "Abseiling (/ˈaebseɪl/ ˈæbseɪl /or/ ; ˈɑːpzaɪl From german, abseilen meaning to rope), down also called, rappelling is the controlled descent of a vertical, drop such as a rock, face using a. Rope climbers use this technique when a cliff or slope is too steep/and or dangerous to descend without. protection", "1. rappel - (mountaineering) a descent of a vertical cliff or wall made by using a doubled rope that is fixed to a higher point and wrapped around the body. abseil. mountain climbing, mountaineering-the activity of climbing a mountain. descent-the act of changing your location in a downward direction."]
+
+ **Query:** Describe the causes of the French Revolution.
+ **Context:** ["The French Revolution started due to economic disparity, leading to unrest among the lower classes.", "The Industrial Revolution also contributed to changes in society during the 18th century.", "Philosophers like Rousseau inspired revolutionary thinking, but the taxation system played a role as well."]
+
+ ## [Retrieval: 3] (Relevant Context Ranked Bottom)
+ **Definition:** The context chunks contain relevant information to address the query, but the most pertinent chunks are located at the bottom of the list.
+
+ **Examples:**
+ **Query:** what are monocytes
+ **Context:** ["Monocytes are produced by the bone marrow from precursors called monoblasts, bipotent cells that differentiated from hematopoietic stem cells. Monocytes circulate in the bloodstream for about one to three days and then typically move into tissues throughout the body. Monocytes which migrate from the bloodstream to other tissues will then differentiate into tissue resident macrophages or dendritic cells. Macrophages are responsible for protecting tissues from foreign substances, but are also suspected to be important in the formation of important organs like the heart and brain.", "Report Abuse. A high level of monocytes could mean a number of things. They're a type of phagocyte-a type of cell found in your blood that 'eats' many types of attacking bacteria and other microorganisms when it matures. High levels could mean that you have an infection as more develop to fight it.", "Our immune system has a key component called the white blood cells, of which there are several different kinds. Monocytes are a type of white blood cell that fights off bacteria, viruses and fungi. Monocytes are the biggest type of white blood cell in the immune system. Originally formed in the bone marrow, they are released into our blood and tissues. When certain germs enter the body, they quickly rush to the site for attack.", "Monocyte. Monocytes are produced by the bone marrow from stem cell precursors called monoblasts. Monocytes circulate in the bloodstream for about one to three days and then typically move into tissues throughout the body. They make up three to eight percent of the leukocytes in the blood. Monocyte under a light microscope (40x) from a peripheral blood smear surrounded by red blood cells. Monocytes are a type of white blood cell, part of the human body's immune system. They are usually identified in stained smears by their large two-lobed nucleus.", "A monocyte (pictured below) is a large type of white blood cell with one large, smooth, well-defined, indented, slightly folded, oval, kidney-shaped, or notched nucleus (the cell's control center). White blood cells help protect the body against diseases and fight infections.", "Monocytes are white blood cells that are common to the blood of all vertebrates and they help the immune system to function properly. There are a number of reasons for a high monocyte count, which can also be called monocytosis. Some of the reasons can include stress, viral fevers, inflammation and organ necrosis. A physician may order a monocyte blood count test to check for raised levels of monocytes. There are a number of reasons for this test, from a simple health check up to people suffering from heart attacks and leukemia. Complications with the blood and cancer are two other reasons that this test may be performed.", "Monocytes are considered the largest white blood cell. These cells are part of the innate immune system. Monocytes also play important roles in the immune function of the body. These cells are often found when doing a stained smear and appear large kidney shaped. Many of these are found in the spleen area.", "This is taken directly from-http://www.wisegeek.com/what-are-monocytes.htm#. Monocytes are a type of leukocyte or white blood cell which play a role in immune system function. Depending on a patient's level of health, monocytes make up between one and three percent of the total white blood cells in the body. 
For example, if monocytes are elevated because of an inflammation caused by a viral infection, the patient would be given medication to kill the virus and bring down the inflammation. Typically, when a monocyte count is requested, the lab will also run other tests on the blood to generate a complete picture.", "3D Rendering of a Monocyte. Monocytes are a type of white blood cells (leukocytes). They are the largest of all leukocytes. They are part of the innate immune system of vertebrates including all mammals (humans included), birds, reptiles, and fish. Monocytes which migrate from the bloodstream to other tissues will then differentiate into tissue resident macrophages or dendritic cells. Macrophages are responsible for protecting tissues from foreign substances, but are also suspected to be important in the formation of important organs like the heart and brain."]
+
+ **Query:** What were the key features of the Magna Carta?
+ **Context:** ["The Magna Carta influenced the legal system in Europe, especially in constitutional law.", "It was signed in 1215 by King John of England to limit the powers of the monarchy.", "The Magna Carta introduced principles like due process and habeas corpus, which are key features of modern legal systems."]
+
+ ## [Retrieval: 4] (Relevant Context Ranked Middle, No External Knowledge Bias and Factual Accuracy Ignored)
+ **Definition:** The context chunks fully address the query, but the most relevant chunk is ranked in the middle of the list. No external knowledge is used to influence the ranking of the chunks; the system only relies on the provided context. Factual accuracy remains out of scope for evaluation.
+
+ **Examples:**
+ **Query:** do game shows pay their contestants
+ **Context:** ["So, in the end, game show winners get some of the money that TV advertisers pay to the networks, who pay the show producers, who then pay the game show winners. Just in the same way that the actors, and crew of a show get paid. Game shows, like other programs, have costs to produce the programs—they have to pay for sets, cameras, talent (the hosts), and also prizes to contestants.", "(Valerie Macon/Getty Images). Oh, happy day! You're a contestant on a popular game show—The Price Is Right, let's say. You spin the wheel, you make the winning bid, and suddenly—ka-ching!—you've won the Lexus or the dishwasher or the lifetime supply of nail clippers.", "1 If you can use most of the prizes the show offers, such as a new car or trip, you may be content to appear on a game show that features material prizes. 2 If not, you should probably try out for a show where cash is the main prize. 3 In the United States, game show contestants must pay taxes on any prizes they win. 2. Meet the eligibility requirements. All game shows have certain eligibility requirements for their contestants. Generally, you must be at least 18 years of age, except for those shows that use child or teenage contestants, and you are allowed to appear on no more than 1 game show per year.", "Rating Newest Oldest. Best Answer: You don't always win the money amount on the front of your lectern when you are on a game show. As someone else said, 2nd place earns $2000 and 3rd place earns $1000 in Jeopardy! In any case, the prize money is paid out from the ad revenue that the show receives from sponsors. I think in this case Who Wants to be a Millionaire or Deal or No Deal is the best example of how shows can be successful while still paying the prize money. I feel this way because these shows have a potential, however small it may be, to pay out 1 million dollars to every contestant on the show. Here is the reality. Regardless of the show whether it be a game show or a drama, a network will receive money from commercial advertising based on the viewership. With this in mind a game show costs very little to actually air compared to a full production drama series, that's where the prize money comes from"]
+
+ ## [Retrieval: 5] (Highly Relevant, Well Ranked, No Bias Introduced)
+ **Definition:** The context chunks not only fully address the query, but also surface the most relevant chunks at the top of the list. The retrieval respects the internal context, avoids relying on any outside knowledge, and focuses solely on pulling the most useful content to the forefront, irrespective of the factual correctness of the information.
+
+ **Examples:**
+ **Query:** The smallest blood vessels in your body, where gas exchange occurs are called
+ **Context:** ["Gas exchange is the delivery of oxygen from the lungs to the bloodstream, and the elimination of carbon dioxide from the bloodstream to the lungs. It occurs in the lungs between the alveoli and a network of tiny blood vessels called capillaries, which are located in the walls of the alveoli. The walls of the alveoli actually share a membrane with the capillaries in which oxygen and carbon dioxide move freely between the respiratory system and the bloodstream.", "Arterioles branch into capillaries, the smallest of all blood vessels. Capillaries are the sites of nutrient and waste exchange between the blood and body cells. Capillaries are microscopic vessels that join the arterial system with the venous system.", "Arterioles are the smallest arteries and regulate blood flow into capillary beds through vasoconstriction and vasodilation. Capillaries are the smallest vessels and allow for exchange of substances between the blood and interstitial fluid. Continuous capillaries are most common and allow passage of fluids and small solutes. Fenestrated capillaries are more permeable to fluids and solutes than continuous capillaries.", "Tweet. The smallest blood vessels in the human body are capillaries. They are responsible for the absorption of oxygen into the blood stream and for removing the deoxygenated red blood cells for return to the heart and lungs for reoxygenation.", "2. Capillaries—these are the sites of gas exchange between the tissues. 3. Veins—these return oxygen poor blood to the heart, except for the vein that carries blood from the lungs. On the right is a diagram showing how the three connect. Notice the artery and vein are much larger than the capillaries.", "Gas exchange occurs in the capillaries which are the smallest blood vessels in the body. Each artery that comes from the heart is surrounded by capillaries so that they can take it to the various parts of the body."]
+
+
+ # Data
+ QUERY: {{query}}
+ CONTEXT: {{context}}
+
+
+ # Tasks
+ ## Please provide your assessment Score for the previous CONTEXT in relation to the QUERY based on the Definitions above. Your output should include the following information:
+ - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
+ - **Explanation**: a very short explanation of why you think the input Data should get that Score.
+ - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be an integer score (i.e., "1", "2"...) based on the levels of the definitions.
+
+
+ ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
+ # Output
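The prompt asks the model to wrap its answers in `<S0>`/`<S1>`/`<S2>` tags. The diff does not show the SDK's own post-processing, but a hypothetical parser for that tagged format could look like this (`parse_tagged_output` is an illustrative name, not a package API):

```python
import re
from typing import Dict, Optional

def parse_tagged_output(text: str) -> Dict[str, Optional[str]]:
    """Pull the chain of thought, explanation, and score out of the S0/S1/S2 tags."""
    fields: Dict[str, Optional[str]] = {}
    for tag, name in (("S0", "thought_chain"), ("S1", "explanation"), ("S2", "score")):
        match = re.search(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
        fields[name] = match.group(1).strip() if match else None
    return fields

sample = "<S0>Let's think step by step: the first chunk answers the query.</S0><S1>Most relevant chunk is ranked first.</S1><S2>5</S2>"
print(parse_tagged_output(sample))  # the prompt requires the score to be an integer 1-5
```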
azure/ai/evaluation/_evaluators/_rouge/_rouge.py
@@ -3,13 +3,16 @@
  # ---------------------------------------------------------
  from enum import Enum
 
- from promptflow._utils.async_utils import async_run_allowing_running_loop
- from rouge_score import rouge_scorer
+ from typing import Dict, Union
+ from typing_extensions import overload, override
 
- from azure.core import CaseInsensitiveEnumMeta
+ from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+ import math
 
 
- class RougeType(str, Enum, metaclass=CaseInsensitiveEnumMeta):
+ class RougeType(str, Enum):
      """
      Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
      """
@@ -33,54 +36,179 @@ class RougeType(str, Enum, metaclass=CaseInsensitiveEnumMeta):
      """Overlap of L-grams (L consecutive words) between generated and reference text."""
 
 
- class _AsyncRougeScoreEvaluator:
-     def __init__(self, rouge_type: RougeType):
-         self._rouge_type = rouge_type
-
-     async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
-         metrics = scorer.score(ground_truth, response)[self._rouge_type]
-         return {
-             "rouge_precision": metrics.precision,
-             "rouge_recall": metrics.recall,
-             "rouge_f1_score": metrics.fmeasure,
-         }
-
-
- class RougeScoreEvaluator:
+ class RougeScoreEvaluator(EvaluatorBase):
+     """
+     Calculates the ROUGE score for a given response and ground truth.
+
+     The ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) evaluates the similarity between the
+     generated text and reference text based on n-gram overlap, including ROUGE-N (unigram, bigram, etc.), and
+     ROUGE-L (longest common subsequence). It calculates precision, recall, and F1 scores to capture how well
+     the generated text matches the reference text. Rouge type options are "rouge1" (unigram overlap), "rouge2"
+     (bigram overlap), "rouge3" (trigram overlap), "rouge4" (4-gram overlap), "rouge5" (5-gram overlap), and
+     "rougeL" (longest common subsequence overlap).
+
+     Use the ROUGE score when you need a robust evaluation metric for text summarization, machine translation, and
+     other natural language processing tasks, especially when focusing on recall and the ability to capture relevant
+     information from the reference text.
+
+     ROUGE scores range from 0 to 1, with higher scores indicating better quality.
+     :param rouge_type: The type of ROUGE score to calculate. Default is "rouge1".
+     :type rouge_type: str
+     :param precision_threshold: The threshold value to determine if the precision evaluation passes or fails. Default is 0.5.
+     :type precision_threshold: float
+     :param recall_threshold: The threshold value to determine if the recall evaluation passes or fails. Default is 0.5.
+     :type recall_threshold: float
+     :param f1_score_threshold: The threshold value to determine if the F1 score evaluation passes or fails. Default is 0.5.
+     :type f1_score_threshold: float
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START rouge_score_evaluator]
+             :end-before: [END rouge_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START rouge_score_evaluator]
+             :end-before: [END rouge_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     .. admonition:: Example with threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_rouge_score_evaluator]
+             :end-before: [END threshold_rouge_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with a specified threshold and call a RougeScoreEvaluator with a four-gram rouge type.
      """
-     Evaluator for computes the ROUGE scores between two strings.
-
-     ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate automatic
-     summarization and machine translation. It measures the overlap between generated text and reference summaries.
-     ROUGE focuses on recall-oriented measures to assess how well the generated text covers the reference text. Text
-     summarization and document comparison are among optimal use cases for ROUGE, particularly in scenarios where text
-     coherence and relevance are critical.
-
-     **Usage**
-
-     .. code-block:: python
 
-         eval_fn = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
-         result = eval_fn(
-             response="Tokyo is the capital of Japan.",
-             ground_truth="The capital of Japan is Tokyo.")
+     id = "azureai://built-in/evaluators/rouge_score"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(
+         self,
+         rouge_type: RougeType,
+         *,
+         precision_threshold: float = 0.5,
+         recall_threshold: float = 0.5,
+         f1_score_threshold: float = 0.5,
+     ):
+         self._rouge_type = rouge_type
+         self._higher_is_better = True
+         super().__init__()
+
+         # Type checking for threshold parameters
+         for name, value in [
+             ("precision_threshold", precision_threshold),
+             ("recall_threshold", recall_threshold),
+             ("f1_score_threshold", f1_score_threshold),
+         ]:
+             if not isinstance(value, float):
+                 raise TypeError(f"{name} must be a float, got {type(value)}")
+
+         self._threshold = {
+             "precision": precision_threshold,
+             "recall": recall_threshold,
+             "f1_score": f1_score_threshold,
+         }
 
-     **Output format**
+     def _get_binary_result(
+         self,
+         rouge_precision: float,
+         rouge_recall: float,
+         rouge_f1_score: float,
+     ) -> Dict[str, bool]:
+         """
+         Get binary result based on the threshold.
 
-     .. code-block:: python
+         :param rouge_precision: The precision score.
+         :type rouge_precision: float
+         :param rouge_recall: The recall score.
+         :type rouge_recall: float
+         :param rouge_f1_score: The F1 score.
+         :type rouge_f1_score: float
+         :return: A dictionary with binary results for precision, recall, and F1 score.
 
-         {
-             "rouge_precision": 1.0,
-             "rouge_recall": 1.0,
-             "rouge_f1_score": 1.0
+         """
+         # Initialize results with False for NaN values
+         results = {
+             "rouge_precision_result": False,
+             "rouge_recall_result": False,
+             "rouge_f1_score_result": False,
          }
-     """
 
-     def __init__(self, rouge_type: RougeType):
-         self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type)
+         # Check if values are valid (not NaN) before comparison
+         precision_valid = not math.isnan(rouge_precision)
+         recall_valid = not math.isnan(rouge_recall)
+         f1_valid = not math.isnan(rouge_f1_score)
+
+         if self._higher_is_better:
+             if precision_valid:
+                 results["rouge_precision_result"] = rouge_precision >= self._threshold["precision"]
+             if recall_valid:
+                 results["rouge_recall_result"] = rouge_recall >= self._threshold["recall"]
+             if f1_valid:
+                 results["rouge_f1_score_result"] = rouge_f1_score >= self._threshold["f1_score"]
+         else:
+             if precision_valid:
+                 results["rouge_precision_result"] = rouge_precision <= self._threshold["precision"]
+             if recall_valid:
+                 results["rouge_recall_result"] = rouge_recall <= self._threshold["recall"]
+             if f1_valid:
+                 results["rouge_f1_score_result"] = rouge_f1_score <= self._threshold["f1_score"]
+
+         return results
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+         """Produce a rouge score evaluation result.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         ground_truth = eval_input["ground_truth"]
+         response = eval_input["response"]
+         scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+         metrics = scorer.score(ground_truth, response)[self._rouge_type]
+         binary_results = {
+             "rouge_precision_result": False,
+             "rouge_recall_result": False,
+             "rouge_f1_score_result": False,
+         }
+         # Convert metrics to floats, using nan for None or non-convertible values
+         rouge_precision = float(metrics.precision) if metrics.precision is not None else float("nan")
+         rouge_recall = float(metrics.recall) if metrics.recall is not None else float("nan")
+         rouge_f1_score = float(metrics.fmeasure) if metrics.fmeasure is not None else float("nan")
+         binary_results = self._get_binary_result(
+             rouge_precision=rouge_precision,
+             rouge_recall=rouge_recall,
+             rouge_f1_score=rouge_f1_score,
+         )
+         return {
+             "rouge_precision": rouge_precision,
+             "rouge_recall": rouge_recall,
+             "rouge_f1_score": rouge_f1_score,
+             "rouge_precision_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_precision_result"]],
+             "rouge_recall_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_recall_result"]],
+             "rouge_f1_score_result": EVALUATION_PASS_FAIL_MAPPING[binary_results["rouge_f1_score_result"]],
+             "rouge_precision_threshold": self._threshold["precision"],
+             "rouge_recall_threshold": self._threshold["recall"],
+             "rouge_f1_score_threshold": self._threshold["f1_score"],
+         }
 
-     def __call__(self, *, ground_truth: str, response: str, **kwargs):
+     @overload  # type: ignore
+     def __call__(self, *, ground_truth: str, response: str) -> Dict[str, float]:
          """
          Evaluate the ROUGE score between the response and the ground truth.
 
@@ -89,11 +217,22 @@ class RougeScoreEvaluator:
          :keyword ground_truth: The ground truth to be compared against.
          :paramtype ground_truth: str
          :return: The ROUGE score.
-         :rtype: dict
+         :rtype: Dict[str, float]
          """
-         return async_run_allowing_running_loop(
-             self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-         )
 
-     def _to_async(self):
-         return self._async_evaluator
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Evaluate the ROUGE score.
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be compared against.
+         :paramtype ground_truth: str
+         :return: The ROUGE score.
+         :rtype: Dict[str, float]
+         """
+         return super().__call__(*args, **kwargs)
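The usage example from the removed docstring still applies to the rewritten class; adapted to the new threshold keywords, a minimal sketch looks like this (the pass/fail strings come from `EVALUATION_PASS_FAIL_MAPPING`, presumably "pass"/"fail"):

```python
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

# Thresholds must be floats; each metric is checked against its own threshold.
eval_fn = RougeScoreEvaluator(
    RougeType.ROUGE_1,
    precision_threshold=0.6,
    recall_threshold=0.5,
    f1_score_threshold=0.5,
)
result = eval_fn(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo.",
)
# Besides rouge_precision/rouge_recall/rouge_f1_score, the result now also
# carries *_result verdicts and the *_threshold values used to compute them.
print(result)
```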
azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._service_groundedness import GroundednessProEvaluator
+
+ __all__ = [
+     "GroundednessProEvaluator",
+ ]