azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation has been flagged as possibly problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -3,67 +3,196 @@ name: Relevance
  description: Evaluates relevance score for QA scenario
  model:
    api: chat
-   configuration:
-     type: azure_openai
-     azure_deployment: ${env:AZURE_DEPLOYMENT}
-     api_key: ${env:AZURE_OPENAI_API_KEY}
-     azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
    parameters:
      temperature: 0.0
-     max_tokens: 1
+     max_tokens: 800
      top_p: 1.0
      presence_penalty: 0
      frequency_penalty: 0
      response_format:
-       type: text
+       type: json_object

  inputs:
    query:
      type: string
    response:
      type: string
-   context:
-     type: string
-
  ---
+
  system:
- You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.
+ You are a Relevance-Judge, an impartial evaluator that scores how well the RESPONSE addresses the user's queries in the CONVERSATION_HISTORY using the definitions provided.
+
  user:
- Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale:
- One star: the answer completely lacks relevance
- Two stars: the answer mostly lacks relevance
- Three stars: the answer is partially relevant
- Four stars: the answer is mostly relevant
- Five stars: the answer has perfect relevance
-
- This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
-
- context: Marie Curie was a Polish-born physicist and chemist who pioneered research on radioactivity and was the first woman to win a Nobel Prize.
- question: What field did Marie Curie excel in?
- answer: Marie Curie was a renowned painter who focused mainly on impressionist styles and techniques.
- stars: 1
-
- context: The Beatles were an English rock band formed in Liverpool in 1960, and they are widely regarded as the most influential music band in history.
- question: Where were The Beatles formed?
- answer: The band The Beatles began their journey in London, England, and they changed the history of music.
- stars: 2
-
- context: The recent Mars rover, Perseverance, was launched in 2020 with the main goal of searching for signs of ancient life on Mars. The rover also carries an experiment called MOXIE, which aims to generate oxygen from the Martian atmosphere.
- question: What are the main goals of Perseverance Mars rover mission?
- answer: The Perseverance Mars rover mission focuses on searching for signs of ancient life on Mars.
- stars: 3
-
- context: The Mediterranean diet is a commonly recommended dietary plan that emphasizes fruits, vegetables, whole grains, legumes, lean proteins, and healthy fats. Studies have shown that it offers numerous health benefits, including a reduced risk of heart disease and improved cognitive health.
- question: What are the main components of the Mediterranean diet?
- answer: The Mediterranean diet primarily consists of fruits, vegetables, whole grains, and legumes.
- stars: 4
-
- context: The Queen's Royal Castle is a well-known tourist attraction in the United Kingdom. It spans over 500 acres and contains extensive gardens and parks. The castle was built in the 15th century and has been home to generations of royalty.
- question: What are the main attractions of the Queen's Royal Castle?
- answer: The main attractions of the Queen's Royal Castle are its expansive 500-acre grounds, extensive gardens, parks, and the historical castle itself, which dates back to the 15th century and has housed generations of royalty.
- stars: 5
-
- context: {{context}}
- question: {{query}}
- answer: {{response}}
- stars:
+ ROLE
+ ====
+ You are a Relevance Evaluator. Your task is to judge how relevant a RESPONSE is to the CONVERSATION_HISTORY using the Relevance definitions provided.
+
+ INPUT
+ =====
+ CONVERSATION_HISTORY: {{query}}
+ RESPONSE: {{response}}
+
+ CONVERSATION_HISTORY is the full dialogue between the user and the agent up to the user's latest message. For single-turn interactions, this will be just the user's query.
+ RESPONSE is the agent's reply to the user's latest message.
+
+ TASK
+ ====
+ Output a JSON object with:
+ 1) a concise explanation of 15-60 words justifying your score based on how well the response is relevant to the user's queries in the CONVERSATION_HISTORY.
+ 2) an integer score from 1 (very poor) to 5 (excellent) using the rubric below.
+
+ The explanation should always precede the score and should clearly justify the score based on the rubric definitions.
+ Response format exactly as follows:
+
+ {
+   "explanation": "<15-60 words>",
+   "score": <1-5>
+ }
+
+ EVALUATION STEPS
+ ================
+ A. Read the CONVERSATION_HISTORY and RESPONSE carefully.
+ B. Identify the user's query from the latest message (use conversation history for context if needed).
+ C. Compare the RESPONSE against the rubric below:
+    - Does the response directly address the user's query?
+    - Is the information complete, partial, or off-topic?
+    - Is it vague, generic, or insightful?
+ D. Match the response to the best score from the rubric.
+ E. Provide a short explanation and the score using the required format.
+
+ SCORING RUBRIC
+ ==============
+
+ ### Score 1 - Irrelevant Response
+ Definition: The response is unrelated to the question. It provides off-topic information and does not attempt to address the question posed.
+
+ **Example A**
+ CONVERSATION_HISTORY: What is the team preparing for?
+ RESPONSE: I went grocery shopping yesterday evening.
+
+ Expected Output:
+ {
+   "explanation": "The response is entirely off-topic and doesn't address the question.",
+   "score": 1
+ }
+
+ **Example B**
+ CONVERSATION_HISTORY: When will the company's new product line launch?
+ RESPONSE: International travel can be very rewarding and educational.
+
+ Expected Output:
+ {
+   "explanation": "The response is completely irrelevant to the product launch question.",
+   "score": 1
+ }
+
+ ### Score 2 - Related but Unhelpful / Superficial
+ Definition: The response is loosely or formally related to the query but fails to deliver any meaningful, specific, or useful information. This includes vague phrases, non-answers, or failure/error messages.
+
+ **Example A**
+ CONVERSATION_HISTORY: What is the event about?
+ RESPONSE: It's something important.
+
+ Expected Output:
+ {
+   "explanation": "The response vaguely refers to the query topic but lacks specific or informative content.",
+   "score": 2
+ }
+
+ **Example B**
+ CONVERSATION_HISTORY: What's the weather in Paris?
+ RESPONSE: I tried to find the forecast but the query failed.
+
+ Expected Output:
+ {
+   "explanation": "The response acknowledges the query but provides no usable weather information. It is related but unhelpful.",
+   "score": 2
+ }
+
+ ### Score 3 - Partially Relevant / Incomplete
+ Definition: The response addresses the query and includes relevant information, but omits essential components or detail. The answer is on-topic but insufficient to fully satisfy the request.
+
+ **Example A**
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
+ RESPONSE: The apartment complex has a gym.
+
+ Expected Output:
+ {
+   "explanation": "The response mentions one amenity but does not provide a fuller list or clarify whether other standard features (like parking or security) are included. It partially addresses the query but lacks completeness.",
+   "score": 3
+ }
+
+ **Example B**
+ CONVERSATION_HISTORY: What services does the premium membership include?
+ RESPONSE: It includes priority customer support.
+
+ Expected Output:
+ {
+   "explanation": "The response identifies one service but omits other likely components of a premium membership (e.g., exclusive content or discounts). The information is relevant but incomplete.",
+   "score": 3
+ }
+
+ ### Score 4 - Fully Relevant / Sufficient Response
+ Definition: The response fully addresses the question with accurate and sufficient information, covering all essential aspects. Very minor omissions are acceptable as long as the core information is intact and the intent is clearly conveyed.
+
+ **Example A**
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
+ RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security.
+
+ Expected Output:
+ {
+   "explanation": "The response mentions multiple key amenities likely to be relevant to most users. While it may not list every feature, it clearly conveys the core offerings of the complex.",
+   "score": 4
+ }
+
+ **Example B**
+ CONVERSATION_HISTORY: What services does the premium membership include?
+ RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases.
+
+ Expected Output:
+ {
+   "explanation": "The response outlines all major services expected from a premium membership. Even if a minor service is not mentioned, the core value is clearly and fully represented.",
+   "score": 4
+ }
+
+ ### Score 5 - Comprehensive Response with Insights
+ Definition: The response not only fully and accurately answers the question, but also adds meaningful elaboration, interpretation, or context that enhances the user's understanding. This goes beyond just listing relevant details — it offers insight into why the information matters, how it's useful, or what impact it has.
+
+ **Example A**
+ CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
+ RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security, designed to offer residents a comfortable and active lifestyle while ensuring their safety.
+
+ Expected Output:
+ {
+   "explanation": "The response fully lists key amenities and additionally explains how these features contribute to resident experience, enhancing the usefulness of the information.",
+   "score": 5
+ }
+
+ **Example B**
+ CONVERSATION_HISTORY: What services does the premium membership include?
+ RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases — tailored for users who want quicker resolutions and first access to new features.
+
+ Expected Output:
+ {
+   "explanation": "The response covers all essential services and adds valuable insight about the target user and benefits, enriching the response beyond basic listing.",
+   "score": 5
+ }
+
+ ### Multi-turn Conversation Example
+ When evaluating responses in a multi-turn conversation, consider the conversation context to understand the user's intent:
+
+ **Example - Multi-turn Context**
+ CONVERSATION_HISTORY: [{"role":"user","content":"I'm planning a vacation to Europe."},{"role":"assistant","content":"That sounds exciting! What time of year are you thinking of traveling?"},{"role":"user","content":"Probably in July. What's the weather like then?"}]
+ RESPONSE: [{"role":"assistant","content":"July is summer in Europe with generally warm and pleasant weather. Most countries have temperatures between 20-25°C (68-77°F). It's a popular travel time, so expect crowds at major tourist attractions and higher accommodation prices."}]
+
+ Expected Output:
+ {
+   "explanation": "The response directly addresses the weather question while providing valuable context about crowds and pricing that's relevant to vacation planning established in the conversation.",
+   "score": 5
+ }
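
The updated prompty drops the `context` input and asks the model for a JSON object with an `explanation` and an integer `score` rather than a bare star rating. Below is a minimal sketch (not part of the diff) of driving the updated evaluator, assuming the public `RelevanceEvaluator` export from `azure.ai.evaluation` and an Azure OpenAI model configuration; the endpoint, deployment, and key values are placeholders, and the exact result keys are indicative only.

```python
# Sketch only: RelevanceEvaluator is the public wrapper around this prompty;
# configuration values are placeholders and result keys may vary by release.
from azure.ai.evaluation import RelevanceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

relevance_evaluator = RelevanceEvaluator(model_config)

# Only query and response are passed now; the old `context` input was removed.
result = relevance_evaluator(
    query="Where were The Beatles formed?",
    response="The Beatles were formed in Liverpool, England.",
)
print(result)  # e.g. {"relevance": 5.0, "relevance_reason": "...", ...}
```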
@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._response_completeness import ResponseCompletenessEvaluator
+
+ __all__ = ["ResponseCompletenessEvaluator"]
@@ -0,0 +1,202 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ import os
+ import logging
+ import math
+ from typing import Dict, List, Union, Optional
+
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+ from azure.ai.evaluation._model_configurations import Conversation, Message
+ from azure.ai.evaluation._common._experimental import experimental
+
+ logger = logging.getLogger(__name__)
+
+
+ @experimental
+ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
+     provided ground truth.
+
+     The completeness measure assesses how thoroughly an AI model's generated response aligns with the key information,
+     claims, and statements established in the ground truth. This evaluation considers the presence, accuracy,
+     and relevance of the content provided.
+
+     The assessment spans multiple levels, ranging from fully incomplete to fully complete, ensuring a comprehensive
+     evaluation of the response's content quality.
+
+     Use this metric when you need to evaluate an AI model's ability to deliver comprehensive and accurate information,
+     particularly in text generation tasks where conveying all essential details is crucial for clarity,
+     context, and correctness.
+
+     Completeness scores range from 1 to 5:
+
+     1: Fully incomplete — Contains none of the necessary information.
+     2: Barely complete — Contains only a small portion of the required information.
+     3: Moderately complete — Covers about half of the required content.
+     4: Mostly complete — Includes most of the necessary details with minimal omissions.
+     5: Fully complete — Contains all key information without any omissions.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START completeness_evaluator]
+             :end-before: [END completeness_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+     """
+
+     # Constants must be defined within eval's directory to be save/loadable
+
+     _PROMPTY_FILE = "response_completeness.prompty"
+     _RESULT_KEY = "response_completeness"
+
+     id = "azureai://built-in/evaluators/response_completeness"
+
+     _MIN_COMPLETENESS_SCORE = 1
+     _MAX_COMPLETENESS_SCORE = 5
+     _DEFAULT_COMPLETENESS_THRESHOLD = 3
+
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(
+         self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
+     ):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self.threshold = threshold  # to be removed in favor of _threshold
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             credential=credential,
+             _higher_is_better=True,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         ground_truth: str,
+         response: str,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate completeness in given response. Accepts ground truth and response for evaluation.
+         Example usage:
+         Evaluating completeness for a response string
+         ```python
+         from azure.ai.evaluation import ResponseCompletenessEvaluator
+         completeness_evaluator = ResponseCompletenessEvaluator(model_config)
+         ground_truth = "The ground truth to be evaluated."
+         response = "The response to be evaluated."
+         completeness_results = completeness_evaluator(ground_truth=ground_truth, response=response)
+         ```
+         :keyword ground_truth: The ground truth to be evaluated.
+         :paramtype ground_truth: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: Union[str, List[Message]]
+         :return: The response completeness score results.
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+         """Evaluate completeness for a conversation
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The response completeness score.
+         :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Invokes the instance using the overloaded __call__ signature.
+
+         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+         """Do completeness evaluation.
+         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         # we override the _do_eval method as we want the output to be a dictionary,
+         # which is a different schema than _base_prompty_eval.py
+         if "ground_truth" not in eval_input or "response" not in eval_input:
+             raise EvaluationException(
+                 message=f"Both ground_truth and response must be provided as input to the completeness evaluator.",
+                 internal_message=f"Both ground_truth and response must be provided as input to the completeness"
+                 f" evaluator.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 target=ErrorTarget.COMPLETENESS_EVALUATOR,
+             )
+
+         result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+         llm_output = result.get("llm_output") if isinstance(result, dict) else result
+
+         score = math.nan
+         llm_output_is_dict = isinstance(llm_output, dict)
+         if llm_output_is_dict or isinstance(llm_output, str):
+             reason = ""
+             if llm_output_is_dict:
+                 score = float(llm_output.get("score", math.nan))
+                 reason = llm_output.get("explanation", "")
+             else:
+                 score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
+
+             binary_result = self._get_binary_result(score)
+
+             # updating the result key and threshold to int based on the schema
+             return {
+                 f"{self._result_key}": int(score),
+                 f"{self._result_key}_result": binary_result,
+                 f"{self._result_key}_threshold": int(self._threshold),
+                 f"{self._result_key}_reason": reason,
+                 f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                 f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                 f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                 f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                 f"{self._result_key}_model": result.get("model_id", ""),
+                 f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                 f"{self._result_key}_sample_output": result.get("sample_output", ""),
+             }
+
+         if logger:
+             logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+         binary_result = self._get_binary_result(score)
+         return {
+             self._result_key: float(score),
+             f"{self._result_key}_result": binary_result,
+             f"{self._result_key}_threshold": self._threshold,
+         }
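
As a short usage sketch (not part of the diff): assuming `ResponseCompletenessEvaluator` is re-exported from the package root, as the new `__init__.py` above suggests, a single-turn call takes `ground_truth` and `response` and returns the `response_completeness` score plus the `_result`, `_threshold`, `_reason`, and token-usage keys built in `_do_eval`. The model configuration values below are placeholders.

```python
# Sketch only: configuration values are placeholders; the result keys mirror
# the _do_eval implementation shown in the diff above.
from azure.ai.evaluation import ResponseCompletenessEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

completeness = ResponseCompletenessEvaluator(model_config, threshold=3)
result = completeness(
    ground_truth="Flu shot can prevent flu-related illnesses.",
    response="A flu shot helps prevent flu-related illnesses.",
)

print(result["response_completeness"])         # integer score from 1 to 5
print(result["response_completeness_result"])  # pass/fail relative to the threshold
print(result["response_completeness_reason"])  # model-provided explanation
```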
@@ -0,0 +1,84 @@
+ ---
+ name: Completeness
+ description: Evaluates Completeness score for QA scenario
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 800
+     top_p: 1.0
+     seed: 123
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: text
+
+ inputs:
+   response:
+     type: string
+   ground_truth:
+     type: string
+
+ ---
+ system:
+ # Instruction
+ ## Goal
+ ### You are an expert in evaluating the quality of a Response from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
+ - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
+ - **Data**: Your input data include Response and Ground Truth.
+ - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+
+ user:
+ # Definition
+ **Completeness** refers to how accurately and thoroughly a response represents the information provided in the ground truth. It considers both the inclusion of all relevant statements and the correctness of those statements. Each statement in the ground truth should be evaluated individually to determine if it is accurately reflected in the response without missing any key information. The scale ranges from 1 to 5, with higher numbers indicating greater completeness.
+
+ # Ratings
+ ## [Completeness: 1] (Fully Incomplete)
+ **Definition:** A response that does not contain any of the necessary and relevant information with respect to the ground truth. It completely misses all the information, especially claims and statements, established in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot cannot cure cancer. Stay healthy requires sleeping exactly 8 hours a day. A few hours of exercise per week will have little benefits for physical and mental health. Physical and mental health benefits are separate topics. Scientists have not studied any of them."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Staying healthy requires proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+ ## [Completeness: 2] (Barely Complete)
+ **Definition:** A response that contains only a small percentage of all the necessary and relevant information with respect to the ground truth. It misses almost all the information, especially claims and statements, established in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires 2 meals a day. Exercise per week makes no difference to physical and mental health. This is because physical and mental health benefits have low correlation through scientific studies. Scientists are making this observation in studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+ ## [Completeness: 3] (Moderately Complete)
+ **Definition:** A response that contains half of the necessary and relevant information with respect to the ground truth. It misses half of the information, especially claims and statements, established in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires a few dollars of investments a day. Even a few dollars of investments per week will not make an impact on physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Fiction writers are starting to discover them through their works."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+ ## [Completeness: 4] (Mostly Complete)
+ **Definition:** A response that contains most of the necessary and relevant information with respect to the ground truth. It misses some minor information, especially claims and statements, established in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires keto diet and rigorous athletic training. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+ ## [Completeness: 5] (Fully Complete)
+ **Definition:** A response that perfectly contains all the necessary and relevant information with respect to the ground truth. It does not miss any information from statements and claims in the ground truth.
+
+ **Examples:**
+ **Response:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
+
+
+ # Data
+ Response: {{response}}
+ Ground Truth: {{ground_truth}}
+
+
+ # Tasks
+ ## Please provide your assessment Score for the previous RESPONSE in relation to the GROUND TRUTH based on the Definitions above. Your output should include the following information:
+ - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
+ - **Explanation**: a very short explanation of why you think the input data should get that Score.
+ - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be an integer score (i.e., "1", "2"...) based on the levels of the definitions.
+
+ ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
+ # Output
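
The prompty asks the model to wrap its chain of thought, explanation, and score in `<S0>`, `<S1>`, and `<S2>` tags, while the evaluator code above parses free-text output through `parse_quality_evaluator_reason_score`. The snippet below is a hypothetical illustration of pulling those tagged fields out of a raw completion; it is not the library's actual parser.

```python
# Hypothetical helper, for illustration only: extracts the <S0>/<S1>/<S2>
# fields that the Completeness prompty asks the model to emit.
import re


def parse_tagged_output(text: str) -> dict:
    """Return thought chain, explanation, and integer score from tagged output."""
    fields = {}
    for tag, name in (("S0", "thought_chain"), ("S1", "explanation"), ("S2", "score")):
        match = re.search(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
        fields[name] = match.group(1).strip() if match else None
    if fields["score"]:
        digits = re.sub(r"\D", "", fields["score"])
        fields["score"] = int(digits) if digits else None
    return fields


sample = "<S0>Let's think step by step: ...</S0><S1>All claims are covered.</S1><S2>5</S2>"
print(parse_tagged_output(sample))
# {'thought_chain': "Let's think step by step: ...", 'explanation': 'All claims are covered.', 'score': 5}
```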
@@ -2,8 +2,8 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- from ._retrieval import RetrievalChatEvaluator
+ from ._retrieval import RetrievalEvaluator

  __all__ = [
-     "RetrievalChatEvaluator",
+     "RetrievalEvaluator",
  ]
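
For existing callers, the rename is a one-line import change. The migration sketch below assumes `RetrievalEvaluator` is re-exported from the package root and accepts per-turn `query` and `context` keyword inputs; the configuration values are placeholders.

```python
# Migration sketch: the 1.0.0b2 import path is shown commented out; the call
# signature (query/context) is an assumption, not confirmed by this diff.
# from azure.ai.evaluation._evaluators._chat.retrieval import RetrievalChatEvaluator  # 1.0.0b2
from azure.ai.evaluation import RetrievalEvaluator  # 1.13.3

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

retrieval = RetrievalEvaluator(model_config)
result = retrieval(
    query="Where were The Beatles formed?",
    context="The Beatles were an English rock band formed in Liverpool in 1960.",
)
print(result)
```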