azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_relevance/relevance.prompty

@@ -10,91 +10,189 @@ model:
      presence_penalty: 0
      frequency_penalty: 0
      response_format:
-       type: text
+       type: json_object

  inputs:
    query:
      type: string
    response:
      type: string
-
  ---
+
  system:
- # Instruction
- ## Goal
- ### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
- - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
- - **Data**: Your input data include QUERY and RESPONSE.
- - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+ You are a Relevance-Judge, an impartial evaluator that scores how well the RESPONSE addresses the user's queries in the CONVERSATION_HISTORY using the definitions provided.

  user:
- # Definition
- **Relevance** refers to how effectively a response addresses a question. It assesses the accuracy, completeness, and direct relevance of the response based solely on the given information.
+ ROLE
+ ====
+ You are a Relevance Evaluator. Your task is to judge how relevant a RESPONSE is to the CONVERSATION_HISTORY using the Relevance definitions provided.

- # Ratings
- ## [Relevance: 1] (Irrelevant Response)
- **Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed.
+ INPUT
+ =====
+ CONVERSATION_HISTORY: {{query}}
+ RESPONSE: {{response}}

- **Examples:**
- **Query:** What is the team preparing for?
- **Response:** I went grocery shopping yesterday evening.
+ CONVERSATION_HISTORY is the full dialogue between the user and the agent up to the user's latest message. For single-turn interactions, this will be just the user's query.
+ RESPONSE is the agent's reply to the user's latest message.
+
+ TASK
+ ====
+ Output a JSON object with:
+ 1) a concise explanation of 15-60 words justifying your score based on how well the response is relevant to the user's queries in the CONVERSATION_HISTORY.
+ 2) an integer score from 1 (very poor) to 5 (excellent) using the rubric below.

- **Query:** When will the company's new product line launch?
- **Response:** International travel can be very rewarding and educational.
+ The explanation should always precede the score and should clearly justify the score based on the rubric definitions.
+ Response format exactly as follows:

- ## [Relevance: 2] (Incorrect Response)
- **Definition:** The response attempts to address the question but includes incorrect information. It provides a response that is factually wrong based on the provided information.
+ {
+ "explanation": "<15-60 words>",
+ "score": <1-5>
+ }

- **Examples:**
- **Query:** When was the merger between the two firms finalized?
- **Response:** The merger was finalized on April 10th.

- **Query:** Where and when will the solar eclipse be visible?
- **Response:** The solar eclipse will be visible in Asia on December 14th.
+ EVALUATION STEPS
+ ================
+ A. Read the CONVERSATION_HISTORY and RESPONSE carefully.
+ B. Identify the user's query from the latest message (use conversation history for context if needed).
+ C. Compare the RESPONSE against the rubric below:
+    - Does the response directly address the user's query?
+    - Is the information complete, partial, or off-topic?
+    - Is it vague, generic, or insightful?
+ D. Match the response to the best score from the rubric.
+ E. Provide a short explanation and the score using the required format.

- ## [Relevance: 3] (Incomplete Response)
- **Definition:** The response addresses the question but omits key details necessary for a full understanding. It provides a partial response that lacks essential information.
+ SCORING RUBRIC
+ ==============

- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The restaurant offers Italian food like pasta.
+ ### Score 1 - Irrelevant Response
+ Definition: The response is unrelated to the question. It provides off-topic information and does not attempt to address the question posed.

- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy and climate change.
+ **Example A**
+ CONVERSATION_HISTORY: What is the team preparing for?
+ RESPONSE: I went grocery shopping yesterday evening.

- ## [Relevance: 4] (Complete Response)
- **Definition:** The response fully addresses the question with accurate and complete information. It includes all essential details required for a comprehensive understanding, without adding any extraneous information.
+ Expected Output:
+ {
+ "explanation": "The response is entirely off-topic and doesn't address the question.",
+ "score": 1
+ }

- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto.

- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy, climate change, and sustainability practices.
+ **Example B**
+ CONVERSATION_HISTORY: When will the company's new product line launch?
+ RESPONSE: International travel can be very rewarding and educational.

- ## [Relevance: 5] (Comprehensive Response with Insights)
- **Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding.
+ Expected Output:
+ {
+ "explanation": "The response is completely irrelevant to the product launch question.",
+ "score": 1
+ }

- **Examples:**
- **Query:** What type of food does the new restaurant offer?
- **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience.

- **Query:** What topics will the conference cover?
- **Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues.
+ ### Score 2 Related but Unhelpful / Superficial
+ Definition: The response is loosely or formally related to the query but fails to deliver any meaningful, specific, or useful information. This includes vague phrases, non-answers, or failure/error messages.

+ **Example A**
+ CONVERSATION_HISTORY: What is the event about?
+ RESPONSE: It’s something important.
+
+ Expected Output:
+ {
+ "explanation": "The response vaguely refers to the query topic but lacks specific or informative content.",
+ "score": 2
+ }
+
+ **Example B**
+ CONVERSATION_HISTORY: What’s the weather in Paris?
+ RESPONSE: I tried to find the forecast but the query failed.
+
+ Expected Output:
+ {
+ "explanation": "The response acknowledges the query but provides no usable weather information. It is related but unhelpful.",
+ "score": 2
+ }
+
+ ### Score 3 - Partially Relevant / Incomplete
+ Definition: The response addresses the query and includes relevant information, but omits essential components or detail. The answer is on-topic but insufficient to fully satisfy the request.
+
+ **Example A**
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
+ RESPONSE: The apartment complex has a gym.
+
+ Expected Output:
+ {
+ "explanation": "The response mentions one amenity but does not provide a fuller list or clarify whether other standard features (like parking or security) are included. It partially addresses the query but lacks completeness.",
+ "score": 3
+ }
+
+ **Example B**
+ CONVERSATION_HISTORY: What services does the premium membership include?
+ RESPONSE: It includes priority customer support.
+
+ Expected Output:
+ {
+ "explanation": "The response identifies one service but omits other likely components of a premium membership (e.g., exclusive content or discounts). The information is relevant but incomplete.",
+ "score": 3
+ }
+
+
+
+ ### Score 4 - Fully Relevant / Sufficient Response
+ Definition: The response fully addresses the question with accurate and sufficient information, covering all essential aspects. Very minor omissions are acceptable as long as the core information is intact and the intent is clearly conveyed.
+
+ **Example A**
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
+ RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security.
+
+ Expected Output:
+ {
+ "explanation": "The response mentions multiple key amenities likely to be relevant to most users. While it may not list every feature, it clearly conveys the core offerings of the complex.",
+ "score": 4
+ }
+
+ **Example B**
+ CONVERSATION_HISTORY: What services does the premium membership include?
+ RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases.
+
+ Expected Output:
+ {
+ "explanation": "The response outlines all major services expected from a premium membership. Even if a minor service is not mentioned, the core value is clearly and fully represented.",
+ "score": 4
+ }


- # Data
- QUERY: {{query}}
- RESPONSE: {{response}}
+ ### Score 5 - Comprehensive Response with Insights
+ Definition: The response not only fully and accurately answers the question, but also adds meaningful elaboration, interpretation, or context that enhances the user's understanding. This goes beyond just listing relevant details — it offers insight into why the information matters, how it's useful, or what impact it has.

+ **Example A**
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
+ RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security, designed to offer residents a comfortable and active lifestyle while ensuring their safety.
+
+ Expected Output:
+ {
+ "explanation": "The response fully lists key amenities and additionally explains how these features contribute to resident experience, enhancing the usefulness of the information.",
+ "score": 5
+ }

- # Tasks
- ## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
- - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
- - **Explanation**: a very short explanation of why you think the input Data should get that Score.
- - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
+ **Example B**
+ CONVERSATION_HISTORY: What services does the premium membership include?
+ RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases tailored for users who want quicker resolutions and first access to new features.

+ Expected Output:
+ {
+ "explanation": "The response covers all essential services and adds valuable insight about the target user and benefits, enriching the response beyond basic listing.",
+ "score": 5
+ }
+
+ ### Multi-turn Conversation Example
+ When evaluating responses in a multi-turn conversation, consider the conversation context to understand the user's intent:

- ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
- # Output
+ **Example - Multi-turn Context**
+ CONVERSATION_HISTORY: [{"role":"user","content":"I'm planning a vacation to Europe."},{"role":"assistant","content":"That sounds exciting! What time of year are you thinking of traveling?"},{"role":"user","content":"Probably in July. What's the weather like then?"}]
+ RESPONSE: [{"role":"assistant","content":"July is summer in Europe with generally warm and pleasant weather. Most countries have temperatures between 20-25°C (68-77°F). It's a popular travel time, so expect crowds at major tourist attractions and higher accommodation prices."}]
+
+ Expected Output:
+ {
+ "explanation": "The response directly addresses the weather question while providing valuable context about crowds and pricing that's relevant to vacation planning established in the conversation.",
+ "score": 5
+ }
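The rewritten relevance.prompty switches `response_format` to `json_object`, so the grader now returns a JSON body with `explanation` and `score` instead of the legacy `<S0>/<S1>/<S2>` tagged output. Below is a minimal usage sketch of the public `RelevanceEvaluator` that this prompty backs; the endpoint and deployment values are placeholders, and the exact output keys (`relevance`, `relevance_reason`) are assumptions based on the result-key pattern visible elsewhere in this diff and may differ by SDK version.

```python
# Minimal sketch (not from the diff): calling the prompty-backed RelevanceEvaluator.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                         # placeholder
    azure_deployment="<chat-deployment>",                        # placeholder
)

relevance = RelevanceEvaluator(model_config)
result = relevance(
    query="What is the team preparing for?",
    response="They are preparing for the product launch next month.",
)

# Output keys are assumed from the f"{result_key}" / f"{result_key}_reason" pattern
# used by this release's evaluators; check your installed version for exact names.
print(result.get("relevance"))         # integer 1-5 parsed from the JSON "score"
print(result.get("relevance_reason"))  # the JSON "explanation", if surfaced
```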
azure/ai/evaluation/_evaluators/_response_completeness/__init__.py (new file)

@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._response_completeness import ResponseCompletenessEvaluator
+
+ __all__ = ["ResponseCompletenessEvaluator"]
azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py (new file)

@@ -0,0 +1,202 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import os
+ import logging
+ import math
+ from typing import Dict, List, Union, Optional
+
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+ from azure.ai.evaluation._model_configurations import Conversation, Message
+ from azure.ai.evaluation._common._experimental import experimental
+
+ logger = logging.getLogger(__name__)
+
+
+ @experimental
+ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
+     provided ground truth.
+
+     The completeness measure assesses how thoroughly an AI model's generated response aligns with the key information,
+     claims, and statements established in the ground truth. This evaluation considers the presence, accuracy,
+     and relevance of the content provided.
+
+     The assessment spans multiple levels, ranging from fully incomplete to fully complete, ensuring a comprehensive
+     evaluation of the response's content quality.
+
+     Use this metric when you need to evaluate an AI model's ability to deliver comprehensive and accurate information,
+     particularly in text generation tasks where conveying all essential details is crucial for clarity,
+     context, and correctness.
+
+     Completeness scores range from 1 to 5:
+
+     1: Fully incomplete — Contains none of the necessary information.
+     2: Barely complete — Contains only a small portion of the required information.
+     3: Moderately complete — Covers about half of the required content.
+     4: Mostly complete — Includes most of the necessary details with minimal omissions.
+     5: Fully complete — Contains all key information without any omissions.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START completeness_evaluator]
+             :end-before: [END completeness_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     """
+
+     # Constants must be defined within eval's directory to be save/loadable
+
+     _PROMPTY_FILE = "response_completeness.prompty"
+     _RESULT_KEY = "response_completeness"
+
+     id = "azureai://built-in/evaluators/response_completeness"
+
+     _MIN_COMPLETENESS_SCORE = 1
+     _MAX_COMPLETENESS_SCORE = 5
+     _DEFAULT_COMPLETENESS_THRESHOLD = 3
+
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(
+         self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
+     ):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self.threshold = threshold  # to be removed in favor of _threshold
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             credential=credential,
+             _higher_is_better=True,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         ground_truth: str,
+         response: str,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate completeness in given response. Accepts ground truth and response for evaluation.
+         Example usage:
+         Evaluating completeness for a response string
+         ```python
+         from azure.ai.evaluation import CompletenessEvaluator
+         completeness_evaluator = CompletenessEvaluator(model_config)
+         ground_truth = "The ground truth to be evaluated."
+         response = "The response to be evaluated."
+         completeness_results = completeness_evaluator(ground_truth=ground_truth, response=response)
+         ```
+         :keword ground_truth: The ground truth to be evaluated.
+         :paramtype ground_truth: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: Union[str, List[Message]]
+         :return: The response completeness score results.
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+         """Evaluate completeness for a conversation
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The fluency score
+         :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Invokes the instance using the overloaded __call__ signature.
+
+         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+         """Do completeness evaluation.
+         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         # we override the _do_eval method as we want the output to be a dictionary,
+         # which is a different schema than _base_prompty_eval.py
+         if "ground_truth" not in eval_input or "response" not in eval_input:
+             raise EvaluationException(
+                 message=f"Both ground_truth and response must be provided as input to the completeness evaluator.",
+                 internal_message=f"Both ground_truth and response must be provided as input to the completeness"
+                 f" evaluator.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 target=ErrorTarget.COMPLETENESS_EVALUATOR,
+             )
+
+         result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+         llm_output = result.get("llm_output") if isinstance(result, dict) else result
+
+         score = math.nan
+         llm_output_is_dict = isinstance(llm_output, dict)
+         if llm_output_is_dict or isinstance(llm_output, str):
+             reason = ""
+             if llm_output_is_dict:
+                 score = float(llm_output.get("score", math.nan))
+                 reason = llm_output.get("explanation", "")
+             else:
+                 score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+
+             binary_result = self._get_binary_result(score)
+
+             # updating the result key and threshold to int based on the schema
+             return {
+                 f"{self._result_key}": int(score),
+                 f"{self._result_key}_result": binary_result,
+                 f"{self._result_key}_threshold": int(self._threshold),
+                 f"{self._result_key}_reason": reason,
+                 f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                 f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                 f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                 f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                 f"{self._result_key}_model": result.get("model_id", ""),
+                 f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                 f"{self._result_key}_sample_output": result.get("sample_output", ""),
+             }
+
+         if logger:
+             logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+         binary_result = self._get_binary_result(score)
+         return {
+             self._result_key: float(score),
+             f"{self._result_key}_result": binary_result,
+             f"{self._result_key}_threshold": self._threshold,
+         }
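Note that the docstring example above still refers to a `CompletenessEvaluator` name, while the class actually exported by the new `__init__.py` is `ResponseCompletenessEvaluator`. A minimal usage sketch follows; the top-level import is assumed from the expanded package `__init__.py` (+83 lines in this diff), endpoint and deployment values are placeholders, and the result keys mirror the `f"{result_key}_*"` pattern in `_do_eval` above.

```python
# Minimal sketch (not from the diff): calling the new ResponseCompletenessEvaluator.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation import ResponseCompletenessEvaluator  # assumed top-level export

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<api-key>",                                         # placeholder
    azure_deployment="<chat-deployment>",                        # placeholder
)

# threshold is keyword-only per the __init__ shown above (default 3).
completeness = ResponseCompletenessEvaluator(model_config, threshold=3)
result = completeness(
    ground_truth="Flu shots can prevent flu-related illnesses.",
    response="Flu shots help prevent the flu and related complications.",
)

# Keys follow the f"{result_key}_*" pattern from _do_eval above.
print(result["response_completeness"])         # integer score 1-5
print(result["response_completeness_result"])  # pass/fail-style verdict vs. the threshold
print(result["response_completeness_reason"])  # model-provided explanation
```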
azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty (new file)

@@ -0,0 +1,84 @@
+ ---
+ name: Completeness
+ description: Evaluates Completeness score for QA scenario
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 800
+     top_p: 1.0
+     seed: 123
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   response:
+     type: string
+   ground_truth:
+     type: string
+
+ ---
+ system:
+ # Instruction
+ ## Goal
+ ### You are an expert in evaluating the quality of a Response from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
+ - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
+ - **Data**: Your input data include Response and Ground Truth.
+ - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+ user:
+ # Definition
+ **Completeness** refers to how accurately and thoroughly a response represents the information provided in the ground truth. It considers both the inclusion of all relevant statements and the correctness of those statements. Each statement in the ground truth should be evaluated individually to determine if it is accurately reflected in the response without missing any key information. The scale ranges from 1 to 5, with higher numbers indicating greater completeness.
+
+ # Ratings
+ ## [Completeness: 1] (Fully Incomplete)
+ **Definition:** A response that does not contain any of the necessary and relevant information with respect to the ground truth. It completely misses all the information, especially claims and statements, established in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot cannot cure cancer. Stay healthy requires sleeping exactly 8 hours a day. A few hours of exercise per week will have little benefits for physical and mental health. Physical and mental health benefits are separate topics. Scientists have not studied any of them."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Staying healthy requires proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+ ## [Completeness: 2] (Barely Complete)
+ **Definition:** A response that contains only a small percentage of all the necessary and relevant information with respect to the ground truth. It misses almost all the information, especially claims and statements, established in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires 2 meals a day. Exercise per week makes no difference to physical and mental health. This is because physical and mental health benefits have low correlation through scientific studies. Scientists are making this observation in studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+ ## [Completeness: 3] (Moderately Complete)
+ **Definition:** A response that contains half of the necessary and relevant information with respect to the ground truth. It misses half of the information, especially claims and statements, established in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires a few dollars of investments a day. Even a few dollars of investments per week will not make an impact on physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Fiction writers are starting to discover them through their works."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+ ## [Completeness: 4] (Mostly Complete)
+ **Definition:** A response that contains most of the necessary and relevant information with respect to the ground truth. It misses some minor information, especially claims and statements, established in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires keto diet and rigorous athletic training. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+ ## [Completeness: 5] (Fully Complete)
+ **Definition:** A response that perfectly contains all the necessary and relevant information with respect to the ground truth. It does not miss any information from statements and claims in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+ # Data
+ Response: {{response}}
+ Ground Truth: {{ground_truth}}
+
+
+ # Tasks
+ ## Please provide your assessment Score for the previous RESPONSE in relation to the GROUND TRUTH based on the Definitions above. Your output should include the following information:
+ - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
+ - **Explanation**: a very short explanation of why you think the input data should get that Score.
+ - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be an integer score (i.e., "1", "2"...) based on the levels of the definitions.
+
+ ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
+ # Output
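Unlike the relevance prompty, this one keeps `response_format: text` and the legacy `<S0>/<S1>/<S2>` tagged output, which `_do_eval` handles through `parse_quality_evaluator_reason_score` when the LLM output arrives as a string. The snippet below is an illustrative stand-in for that parsing step, not the library's implementation; the regexes, tag handling, and defaults are assumptions.

```python
# Illustrative parser for "<S1>explanation</S1> <S2>score</S2>" style output.
# This mimics what a helper like parse_quality_evaluator_reason_score might do;
# it is not the azure-ai-evaluation implementation.
import math
import re
from typing import Tuple


def parse_tagged_reason_score(llm_output: str) -> Tuple[float, str]:
    score = math.nan
    reason = ""

    # Explanation lives between <S1> tags.
    reason_match = re.search(r"<S1>(.*?)</S1>", llm_output, re.DOTALL)
    if reason_match:
        reason = reason_match.group(1).strip()

    # Score is a single digit in the 1-5 range between <S2> tags.
    score_match = re.search(r"<S2>\D*?([1-5])\D*?</S2>", llm_output, re.DOTALL)
    if score_match:
        score = float(score_match.group(1))

    return score, reason


sample = "<S0>Let's think step by step: ...</S0><S1>Covers all key claims.</S1><S2>5</S2>"
print(parse_tagged_reason_score(sample))  # (5.0, 'Covers all key claims.')
```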
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py

@@ -31,6 +31,13 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]
+     :param threshold: The threshold for the evaluation. Default is 3.
+     :type threshold: float
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool
      :return: A function that evaluates and generates metrics for "chat" scenario.
      :rtype: Callable

@@ -43,6 +50,25 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
          :dedent: 8
          :caption: Initialize and call a RetrievalEvaluator.

+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START retrieval_evaluator]
+             :end-before: [END retrieval_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_retrieval_evaluator]
+             :end-before: [END threshold_retrieval_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a RetrievalEvaluator.
+
      .. note::

          To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.

@@ -53,14 +79,24 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      _PROMPTY_FILE = "retrieval.prompty"
      _RESULT_KEY = "retrieval"

-     id = "azureml://registries/azureml/models/Retrieval-Evaluator/versions/1"
+     id = "azureai://built-in/evaluators/retrieval"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config):  # pylint: disable=super-init-not-called
+     def __init__(self, model_config, *, threshold: float = 3, credential=None, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             credential=credential,
+             _higher_is_better=self._higher_is_better,
+             **kwargs,
+         )

      @overload
      def __call__(
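Across this release, graded evaluators gain a `threshold` parameter (default 3 for RetrievalEvaluator above) and emit a `*_result` verdict alongside the numeric score. The helper below is a simplified illustration of that thresholding pattern as it appears in the completeness evaluator's `_do_eval`; the SDK's internal `_get_binary_result` may differ in naming, label strings, and edge-case handling.

```python
# Illustrative sketch of the score-vs-threshold pass/fail mapping used by the
# new *_result keys. Not the SDK's _get_binary_result implementation.
import math


def binary_result(score: float, threshold: float = 3, higher_is_better: bool = True) -> str:
    if math.isnan(score):
        return "fail"
    if higher_is_better:
        return "pass" if score >= threshold else "fail"
    return "pass" if score <= threshold else "fail"


# Mirrors the key layout produced by the evaluators in this diff (names assumed).
score = 4.0
print({
    "retrieval": score,
    "retrieval_result": binary_result(score, threshold=3),
    "retrieval_threshold": 3,
})
```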