azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff shows the content of publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

This version of azure-ai-evaluation was flagged as a potentially problematic release.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
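
Because the file list alone does not show what the expanded top-level `__init__.py` (+83 lines) now re-exports, one low-risk way to check after upgrading is to inspect the installed package at runtime. A minimal sketch, assuming the 1.13.3 wheel is installed; the printed names are whatever the release actually exports:

```python
# Confirm which azure-ai-evaluation build is installed and list its public exports.
from importlib.metadata import version

import azure.ai.evaluation as ai_eval

print(version("azure-ai-evaluation"))  # expected: 1.13.3 after upgrading
print(sorted(name for name in dir(ai_eval) if not name.startswith("_")))
```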
@@ -29,11 +29,16 @@ system:
 
 user:
 # Definition
-**Groundedness** refers to how faithfully a response adheres to the information provided in the CONTEXT, ensuring that all content is directly supported by the context without introducing unsupported information or omitting critical details. It evaluates the fidelity and precision of the response in relation to the source material.
+**Groundedness** refers to how well a response is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the response directly and fully addresses the information without introducing unrelated or incorrect information.
+
+> Context is the source of truth for evaluating the response.
+> Evaluate the groundedness of the response message based on the provided context.
 
 # Ratings
-## [Groundedness: 1] (Completely Ungrounded Response)
-**Definition:** The response is entirely unrelated to the CONTEXT, introducing topics or information that have no connection to the provided material.
+## [Groundedness: 1] (Completely Unrelated Response)
+**Definition:** A response that does not relate to the context in any way.
+- Does not relate to the context at all.
+- Talks about the general topic but does not respond to the context.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter.
@@ -42,8 +47,8 @@ user:
 **Context:** The new smartphone model features a larger display and improved battery life.
 **Response:** The history of ancient Egypt is fascinating and full of mysteries.
 
-## [Groundedness: 2] (Contradictory Response)
-**Definition:** The response directly contradicts or misrepresents the information provided in the CONTEXT.
+## [Groundedness: 2] (Attempts to Respond but Contains Incorrect Information)
+**Definition:** A response that attempts to relate to the context but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. Even if some points are correct, the presence of inaccuracies makes the response unreliable.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter.
@@ -52,18 +57,18 @@ user:
 **Context:** The new smartphone model features a larger display and improved battery life.
 **Response:** The new smartphone model has a smaller display and shorter battery life.
 
-## [Groundedness: 3] (Accurate Response with Unsupported Additions)
-**Definition:** The response accurately includes information from the CONTEXT but adds details, opinions, or explanations that are not supported by the provided material.
+## [Groundedness: 3] (Accurate but Vague Response)
+**Definition:** A response that provides accurate information from the context but is overly generic or vague, not meaningfully engaging with the specific details in the context. The information is correct but lacks specificity and detail.
 
 **Examples:**
-**Context:** The company's profits increased by 20% in the last quarter.
-**Response:** The company's profits increased by 20% in the last quarter due to their aggressive marketing strategy.
+**Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.
+**Response:** The company is doing well financially.
 
-**Context:** The new smartphone model features a larger display and improved battery life.
-**Response:** The new smartphone model features a larger display, improved battery life, and comes with a free case.
+**Context:** The new smartphone model features a larger display, improved battery life, and an upgraded camera system.
+**Response:** The smartphone has some nice features.
 
-## [Groundedness: 4] (Incomplete Response Missing Critical Details)
-**Definition:** The response contains information from the CONTEXT but omits essential details that are necessary for a comprehensive understanding of the main point.
+## [Groundedness: 4] (Partially Correct Response)
+**Definition:** A response that provides correct information from the context but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.
@@ -73,7 +78,7 @@ user:
 **Response:** The new smartphone model features a larger display and improved battery life.
 
 ## [Groundedness: 5] (Fully Grounded and Complete Response)
-**Definition:** The response is entirely based on the CONTEXT, accurately and thoroughly conveying all essential information without introducing unsupported details or omitting critical points.
+**Definition:** A response that thoroughly and accurately conveys information from the context, including all relevant details. It directly addresses the context with precise information, demonstrating complete understanding without adding extraneous information.
 
 **Examples:**
 **Context:** The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.
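
The four hunks above rewrite the 1-5 rating definitions in one of the groundedness prompty files listed at entries 126-127 (the page does not name the file for each hunk): level 1 now targets unrelated responses, level 2 incorrect information, level 3 accurate-but-vague answers, level 4 partially complete answers, and level 5 fully grounded, complete answers. Below is a minimal sketch of exercising the revised rubric; it assumes the GroundednessEvaluator call signature (context/response, optional query) is unchanged from 1.0.x, and the endpoint, deployment, and key values are placeholders.

```python
# Sketch: score a deliberately vague answer against the revised rubric (expected around level 3).
from azure.ai.evaluation import AzureOpenAIModelConfiguration, GroundednessEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<your-gpt-deployment>",  # placeholder
    api_key="<your-api-key>",  # placeholder
)

groundedness = GroundednessEvaluator(model_config)
result = groundedness(
    context="The company's profits increased by 20% in the last quarter, marking the highest growth rate in its history.",
    response="The company is doing well financially.",  # "accurate but vague" per the new level-3 definition
)
print(result)  # e.g. {"groundedness": 3.0, "groundedness_reason": "...", ...}
```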
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._intent_resolution import IntentResolutionEvaluator
+
+__all__ = ["IntentResolutionEvaluator"]
@@ -0,0 +1,196 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import math
+import logging
+from typing import Dict, Union, List, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation, Message
+from ..._common.utils import check_score_is_valid, reformat_conversation_history, reformat_agent_response
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """
+    Evaluates intent resolution for a given query and response or a multi-turn conversation, including reasoning.
+
+    The intent resolution evaluator assesses whether the user intent was correctly identified and resolved.
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START intent_resolution_evaluator]
+            :end-before: [END intent_resolution_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an IntentResolutionEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START intent_resolution_evaluator]
+            :end-before: [END intent_resolution_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    """
+
+    _PROMPTY_FILE = "intent_resolution.prompty"
+    _RESULT_KEY = "intent_resolution"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    _MIN_INTENT_RESOLUTION_SCORE = 1
+    _MAX_INTENT_RESOLUTION_SCORE = 5
+    _DEFAULT_INTENT_RESOLUTION_THRESHOLD = 3
+
+    id = "azureai://built-in/evaluators/intent_resolution"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        self.threshold = threshold
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=threshold,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Optional[Union[dict, List[dict]]] = None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate intent resolution for a given query, response and optional tool definitions.
+        The query and response can be either a string or a list of messages.
+
+        Example with string inputs and no tools:
+            evaluator = IntentResolutionEvaluator(model_config)
+            query = "What is the weather today?"
+            response = "The weather is sunny."
+
+            result = evaluator(query=query, response=response)
+
+        Example with list of messages:
+            evaluator = IntentResolutionEvaluator(model_config)
+            query: [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+            response: [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+            tool_definitions: [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
+
+            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        :keyword query: The query to be evaluated which is either a string or a list of messages.
+            The list of messages is the previous conversation history of the user and agent, including system messages and tool calls.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response to be evaluated, which is either a string or a list of messages (full agent response potentially including tool calls)
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+        :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
+        :return: A dictionary with the intent resolution evaluation
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do intent resolution evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # we override the _do_eval method as we want the output to be a dictionary, which is a different schema than _base_prompty_eval.py
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message=f"Both query and response must be provided as input to the intent resolution evaluator.",
+                internal_message=f"Both query and response must be provided as input to the intent resolution evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
+            )
+        # reformat query and response to the format expected by the prompty flow
+        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict["llm_output"]
+        # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+        score = math.nan
+        if isinstance(llm_output, dict):
+            score = llm_output.get("score", math.nan)
+            if not check_score_is_valid(
+                score,
+                IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE,
+                IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE,
+            ):
+                raise EvaluationException(
+                    message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
+                    internal_message="Invalid score value.",
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    blame=ErrorBlame.SYSTEM_ERROR,
+                )
+            reason = llm_output.get("explanation", "")
+            score = float(score)
+            score_result = "pass" if score >= self._threshold else "fail"
+
+            response_dict = {
+                f"{self._result_key}": score,
+                f"gpt_{self._result_key}": score,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+            return response_dict
+        # If llm_output is not a dictionary, return NaN for the score. This should never happen
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
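
This +196-line hunk matches the `_intent_resolution/_intent_resolution.py` entry in the file list and, together with the `__init__.py` hunk above it, adds the experimental `IntentResolutionEvaluator`. The sketch below adapts the string-input example from the class's own docstring; the model configuration values are placeholders, and because the class is marked `@experimental`, the result keys may change between releases.

```python
# Usage sketch adapted from the docstring above (string query/response, no tool definitions).
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._intent_resolution import IntentResolutionEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<your-gpt-deployment>",  # placeholder
    api_key="<your-api-key>",  # placeholder
)

evaluator = IntentResolutionEvaluator(model_config)  # default threshold is 3
result = evaluator(
    query="What is the weather today?",
    response="The weather is sunny.",
)
# _do_eval builds the keys from _RESULT_KEY: "intent_resolution", "intent_resolution_result",
# "intent_resolution_reason", "intent_resolution_threshold", plus token-count fields.
print(result["intent_resolution"], result["intent_resolution_result"])
```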
@@ -0,0 +1,275 @@
+---
+name: Intent Resolution Evaluator
+description: Evaluates whether user intent was identified and correctly resolved
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 800
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_object
+
+inputs:
+  query:
+    type: string
+  response:
+    type: string
+  tool_definitions:
+    type: string
+    optional: true
+    default: "[]"
+---
+system:
+You are an expert in evaluating the quality of a AGENT_RESPONSE from an intelligent assistant based on provided definition and CONVERSATION_HISTORY.
+
+user:
+ROLE
+====
+You are Intent-Resolution-Judge, an impartial grader that scores how well an AI agent *resolved* the user's intent in a multi-turn conversation.
+You are NOT grading intent recognition. Assume the agent has understood the intent that is expressed; you only judge whether the reply satisfies or completes that intent.
+
+
+INPUT
+=====
+CONVERSATION_HISTORY: {{query}}
+AGENT_RESPONSE: {{response}}
+
+CONVERSATION_HISTORY is the full dialogue between the user and the agent up to the user's latest message.
+AGENT_RESPONSE is the agent reply to that latest user message.
+
+
+TASK
+====
+Output a JSON object with:
+1) a concise explanation of 15-60 words that summarizes the agent's performance in resolving the user's intent
+2) an integer score from 1 (very poor) to 5 (excellent) on how well the agent resolved the user's intent.
+
+The explanation should always precede the score and should clearly justify the score based on the agent's performance in resolving the user's intent.
+Response format exactly as follows:
+
+{
+  "explanation": "<15-60 words>",
+  "score": <1-5>
+}
+
+EVALUATION STEPS
+================
+
+A. Identify the expressed intent in the final user turn (look at the full conversation_history for context if necessary).
+B. Check resolution - Does the agent's reply, in AGENT_RESPONSE, actually complete or satisfy that intent?
+   - If the agent's response is a direct answer, does it fully address the user's request?
+   - If the agent's response is an action (like scheduling, deleting, etc.), does it confirm completion of that action?
+   - If the agent's response is a clarification or follow-up question, does it lead towards fulfilling the intent?
+   - If the agent response is empty or irrelevant, it does not resolve the intent and should be scored accordingly.
+C. Verify correctness & completeness of the resolution.
+D. Weigh impact - Minor style issues matter only for tie-breaking; resolution quality dominates.
+E. Write a concise explanation of 15-60 words that summarizes the agent's performance in resolving the user's intent, including:
+   - What the user wanted
+   - How well the agent addressed it
+   - Any notable gaps or strengths
+F. Choose the single closest integer score (1,2,3,4 or 5) from the scoring rubric below.
+
+SCORING RUBRIC
+5 - Excellent: Agent fully fulfills the intent with accurate, thorough, relevant action.
+4 - Good: Intent mostly resolved; only minor, non-blocking gaps or imprecision.
+3 - Adequate: Core of intent addressed but notable omissions, vagueness, or slips.
+2 - Poor: Partial or incorrect resolution; answer unlikely to satisfy the intent.
+1 - Very poor: Intent ignored or badly misunderstood; irrelevant or harmful response.
+
+
+EXAMPLES
+========
+
+### Score 5 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+Give me a Python one-liner to reverse a string.
+
+AGENT_RESPONSE:
+`s[::-1]` - this slice returns the string in reverse.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a Python one-liner to reverse a string. Agent provided the correct code and a brief, helpful explanation, fully resolving the request with notable conciseness and accuracy.",
+  "score": 5
+}
+
+
+### Score 5 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+I need to cancel my 2 pm meeting with Dr. Lee this Friday.
+
+Agent turn 1:
+To confirm, that's Friday at 14:00 with Dr. Lee in Meeting Room 2, right?
+
+User turn 2:
+Yes, that one.
+
+AGENT_RESPONSE:
+Done! The event has been removed from your calendar and a cancellation email was sent to Dr. Lee.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to cancel a meeting. Agent first confirmed details, then successfully completed the cancellation and notified the attendee, fully resolving the intent with excellent clarity and thoroughness.",
+  "score": 5
+}
+
+
+### Score 4 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+Translate 'carpe diem' to Portuguese and explain it in one sentence.
+
+AGENT_RESPONSE:
+'Aproveita o dia.' Significa que deves aproveitar o momento presente.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a Portuguese translation of 'carpe diem' and a one-sentence explanation. Agent provided an accurate translation and a concise explanation, mostly resolving the intent adequately.",
+  "score": 4
+}
+
+
+### Score 4 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+I need three book recommendations on deep work.
+
+Agent turn 1:
+Sure—any particular genre?
+
+User turn 2:
+Non-fiction, preferably practical.
+
+AGENT_RESPONSE:
+1) *Deep Work* by Cal Newport, 2) *Indistractable* by Nir Eyal, 3) *Digital Minimalism* by Cal Newport.
+All are actionable non-fiction.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted three practical non-fiction book recommendations on deep work. Agent supplied three relevant titles, confirming they fit the criteria, mostly resolving the intent. Brief justifications for each pick would improve it.",
+  "score": 4
+}
+
+
+### Score 3 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+Summarise the key plot points of *The Hobbit* in 5 bullet points.
+
+AGENT_RESPONSE:
+• Bilbo meets Gandalf • They find a ring • They fight a dragon.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted a 5-bullet summary of *The Hobbit*'s plot. Agent provided only three bullets, capturing some main events but omitting several pivotal ones and failing to meet the requested count, thus only adequately addressing the intent.",
+  "score": 3
+}
+
+
+### Score 3 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+What's the weather in Tokyo tomorrow?
+
+Agent turn 1:
+Would you like it in Celsius or Fahrenheit?
+
+User turn 2:
+Celsius, please.
+
+AGENT_RESPONSE:
+Looks mostly sunny with a high around 24 °C and a 10 % chance of light showers.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted tomorrow's weather in Tokyo (Celsius). Agent provided temperature and rain chance but omitted wind information, offering an adequate but incomplete resolution of the intent.",
+  "score": 3
+}
+
+
+### Score 2 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+"Delete my last email draft."
+
+AGENT_RESPONSE:
+"Here's how to compose a new email…"
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to delete their last email draft. Agent misunderstood the request, offering irrelevant guidance on composing a new email instead of performing the deletion, resulting in a poor resolution.",
+  "score": 2
+}
+
+
+### Score 2 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+"Calculate 15 % of 80."
+
+Agent turn 1:
+"Do you need the exact number or a rounded estimate?"
+
+User turn 2:
+"Exact, please."
+
+AGENT_RESPONSE:
+"It's 10.5."
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted an exact calculation of 15% of 80. Agent provided an incorrect result (10.5 instead of 12), leading to a poor resolution of the user's intent.",
+  "score": 2
+}
+
+
+### Score 1 - Example A (single-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+"Mute notifications for the next hour."
+
+AGENT_RESPONSE:
+"Here's a cake recipe! 🍰"
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to mute notifications for an hour. Agent responded with a completely irrelevant cake recipe, making no attempt to address the user's intent, resulting in a very poor resolution.",
+  "score": 1
+}
+
+
+### Score 1 - Example B (multi-turn)
+
+CONVERSATION_HISTORY:
+User turn 1:
+"Set an alarm for 6 am tomorrow."
+
+Agent turn 1:
+"Why not sleep in? Alarms are overrated."
+
+User turn 2:
+"I really need the alarm—please set it."
+
+AGENT_RESPONSE:
+"Alarms are pointless; wake up whenever."
+
+EXPECTED OUTPUT:
+{
+  "explanation": "User wanted to set an alarm for 6 am. Agent was dismissive and refused to perform the requested action, completely failing to resolve the user's intent, leading to a very poor resolution.",
+  "score": 1
+}
+
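
Beyond calling the evaluator directly, the release's reworked batch entry point (`_evaluate/_evaluate.py`, +1809 -142 in the file list) can run it over a dataset. A sketch, assuming `evaluate()` keeps its 1.0.x signature of a JSONL data path plus an evaluators mapping; `data.jsonl` is a hypothetical file with `query` and `response` columns, and `evaluator` is the instance from the previous sketch:

```python
# Sketch: batch-run the intent resolution evaluator over a JSONL dataset.
from azure.ai.evaluation import evaluate

results = evaluate(
    data="data.jsonl",  # hypothetical input file with "query" and "response" columns
    evaluators={"intent_resolution": evaluator},  # instance from the previous sketch
    output_path="intent_resolution_results.json",
)
print(results["metrics"])
```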