azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the registry.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty
@@ -0,0 +1,220 @@
+ ---
+ name: Task Completion
+ description: Evaluates whether a task was successfully completed
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 1500
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: json_object
+ inputs:
+   query:
+     type: string
+   response:
+     type: string
+   tool_definitions:
+     type: Dict
+     optional: true
+     default: {}
+ ---
+ system:
+ You are an expert evaluator who determines if an agent has successfully completed the task required by the user based on the final outcome.
+
+ user:
+ ROLE
+ ====
+ You are a judge on Task Completion who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**
+
+ You are NOT evaluating:
+ - How well the agent followed instructions
+ - How well the agent understood the user's intent
+
+ You ARE evaluating:
+ - Whether the task is actually completed in the final outcome
+ - Whether the deliverable meets the user's requirements
+ - Whether the end result is actionable and usable
+
+ INPUT
+ =====
+ CONVERSATION_HISTORY: {{query}}
+ AGENT_RESPONSE: {{response}}
+
+ CONVERSATION_HISTORY includes the full dialogue. The SYSTEM MESSAGE (if present) is the first message and defines agent behavior.
+ AGENT_RESPONSE is the agent's reply to the latest user query.
+ Tool calls and tool results are not visible to the user. The user only sees the agent's final response.
+
+ EVALUATION FRAMEWORK
+ ====================
+
+ A. Identify the Task Requirements:
+ - What specific outcome did the user request?
+ - What deliverables were expected?
+ - What criteria define "completion" for this task?
+
+ B. Analyze the Final Outcome:
+ - What did the agent actually deliver?
+ - Is the deliverable complete and usable?
+ - Does it meet the user's specified requirements?
+
+ C. Assess Task Completion:
+ - **Fully Complete**: All requirements met, deliverable is ready for use
+ - **Partially Complete**: Some requirements met, but missing key components
+ - **Incomplete**: No usable deliverable or major requirements unmet
+
+ D. Assign a Score:
+ - **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
+ - **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved.
+
+ OUTPUT FORMAT
+ =============
+ Output a JSON object with these keys:
+ {
+   "explanation": "<15-60 words explaining the completion status>",
+   "details": {
+     "task_requirements": "<15-60 words on what the user specifically requested>",
+     "delivered_outcome": "<15-60 words on what the agent actually provided>",
+     "completion_gaps": "<15-60 words on missing elements if task is incomplete>"
+   },
+   "success": <TRUE or FALSE>
+ }
+
+ SCORING EXAMPLES
+ ================
+
+ ### SUCCESS: TRUE - Example A
+
+ CONVERSATION_HISTORY:
+ User: Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.
+
+ AGENT_RESPONSE:
+ **Day 1:**
+ - Morning: Visit Louvre Museum (9 AM - 12 PM)
+ - Lunch: Le Comptoir du Relais (traditional French bistro)
+ - Afternoon: Walk through Tuileries Garden
+ - Dinner: L'As du Fallafel (famous in Marais district)
+
+ **Day 2:**
+ - Morning: Tour Eiffel Tower and Trocadéro
+ - Lunch: Du Pain et des Idées (artisanal bakery)
+ - Afternoon: Explore Montmartre and Sacré-Cœur
+ - Dinner: Le Procope (historic restaurant)
+
+ **Day 3:**
+ - Morning: Visit Notre-Dame Cathedral area
+ - Lunch: Marché des Enfants Rouges (food market)
+ - Afternoon: Musée d'Orsay
+ - Dinner: Guy Savoy (Michelin-starred French cuisine)
+
+ Transportation: Metro Day Pass recommended. Book Louvre tickets online in advance.
+
+ EXPECTED OUTPUT:
+ {
+   "explanation": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.",
+   "details": {
+     "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine",
+     "delivered_outcome": "Detailed 3-day schedule with specific landmarks, restaurants, and practical tips",
+     "completion_gaps": "None"
+   },
+   "success": TRUE
+ }
+
+ ### SUCCESS: TRUE - Example B
+
+ CONVERSATION_HISTORY:
+ User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
+
+ AGENT_RESPONSE:
+ [TOOL_CALL] run_network_diagnostic()
+ [TOOL_RESULT] Issue found: Router firmware outdated, DNS servers slow
+
+ [TOOL_CALL] upgrade_plan(customer_id="12345", plan="unlimited")
+ [TOOL_RESULT] Plan upgraded successfully. Unlimited data active immediately.
+
+ Your internet issues are resolved - update firmware and change DNS as instructed. Your unlimited data plan is now active. You should see stable connection and no data limits immediately.
+
+ EXPECTED OUTPUT:
+ {
+   "explanation": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.",
+   "details": {
+     "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
+     "delivered_outcome": "Diagnostic completed with specific fix instructions, data plan upgrade confirmed active",
+     "completion_gaps": "None"
+   },
+   "success": TRUE
+ }
+
+ ### SUCCESS: FALSE - Example A
+
+ CONVERSATION_HISTORY:
+ User: Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.
+
+ AGENT_RESPONSE:
+ I'd recommend visiting the Louvre and Eiffel Tower. Paris has great food - try French pastries and wine. The Metro is convenient for getting around.
+
+ EXPECTED OUTPUT:
+ {
+   "explanation": "Vague suggestions provided but no actual day-by-day itinerary created. Missing specific landmarks, restaurants, and detailed schedule.",
+   "details": {
+     "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine",
+     "delivered_outcome": "General recommendations without structured itinerary",
+     "completion_gaps": "No day-by-day schedule, no specific restaurant recommendations, no detailed cultural landmarks"
+   },
+   "success": FALSE
+ }
+
+ ### SUCCESS: FALSE - Example B
+
+ CONVERSATION_HISTORY:
+ User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
+
+ AGENT_RESPONSE:
+ [TOOL_CALL] run_network_diagnostic()
+ [TOOL_RESULT] Issue found: Router firmware outdated
+ I found that your router firmware needs updating. You can usually do this through your router's admin panel. Let me know if you need help with the data plan upgrade after you fix the connection.
+
+ EXPECTED OUTPUT:
+ {
+   "explanation": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.",
+   "details": {
+     "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
+     "delivered_outcome": "Identified router firmware issue but no specific fix instructions, data upgrade not completed",
+     "completion_gaps": "No specific firmware update steps, data plan upgrade not processed"
+   },
+   "success": FALSE
+ }
+
+ ### SUCCESS: FALSE - Example C
+
+ CONVERSATION_HISTORY:
+ User: Analyze our Q3 sales data and generate a summary report.
+
+ AGENT_RESPONSE:
+ I can help you analyze sales data! Sales analysis is important for business growth. You should look at metrics like revenue, customer acquisition, and seasonal trends. Would you like me to explain different types of sales analysis techniques?
+
+ EXPECTED OUTPUT:
+ {
+   "explanation": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.",
+   "details": {
+     "task_requirements": "Analyze Q3 sales data and generate summary report",
+     "delivered_outcome": "General information about sales analysis concepts",
+     "completion_gaps": "No data analysis performed, no summary report created"
+   },
+   "success": FALSE
+ }
+
+ KEY PRINCIPLES
+ ==============
+
+ 1. **Outcome Focus**: Judge only the final deliverable, not the process
+ 2. **User Readiness**: Can the user proceed with what was delivered?
+ 3. **Requirement Matching**: Does the outcome match what was specifically requested?
+ 4. **Completeness**: Are all components of the task addressed?
+ 5. **Actionability**: Is the deliverable usable in its current form?
+
+ Remember: A task can be understood correctly and approached properly but still fail if the final outcome doesn't meet requirements.
+
+ # Output
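
The prompty above instructs the judge to emit a fixed JSON shape (`explanation`, `details`, `success`), and its examples spell the verdict as uppercase TRUE/FALSE. As a minimal sketch of consuming that output — assuming nothing about azure-ai-evaluation's own parsing, with all function and variable names purely illustrative — a tolerant validator might look like this:

import json

def parse_task_completion_output(raw: str) -> dict:
    # Hypothetical helper, not part of the package API: validate a judge reply
    # against the OUTPUT FORMAT section of task_completion.prompty.
    result = json.loads(raw)
    for key in ("explanation", "details", "success"):
        if key not in result:
            raise ValueError(f"missing required key: {key}")
    for sub in ("task_requirements", "delivered_outcome", "completion_gaps"):
        if sub not in result["details"]:
            raise ValueError(f"missing details key: {sub}")
    success = result["success"]
    if isinstance(success, str):
        # The prompty's examples use TRUE/FALSE rather than JSON booleans,
        # so normalize string verdicts alongside native booleans.
        success = success.strip().upper() == "TRUE"
    result["success"] = bool(success)
    return result

reply = '{"explanation": "Task done.", "details": {"task_requirements": "r", "delivered_outcome": "d", "completion_gaps": "None"}, "success": true}'
print(parse_task_completion_output(reply)["success"])  # True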
azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py
@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator, _TaskNavigationEfficiencyMatchingMode
+
+ __all__ = ["_TaskNavigationEfficiencyEvaluator", "_TaskNavigationEfficiencyMatchingMode"]
azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
@@ -0,0 +1,384 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from enum import Enum
+ from collections import Counter
+ import json
+ from typing import Dict, List, Union, Any, Tuple
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._exceptions import (
+     ErrorCategory,
+     ErrorTarget,
+     EvaluationException,
+ )
+
+
+ class _TaskNavigationEfficiencyMatchingMode(str, Enum):
+     """
+     Enumeration of task navigation efficiency matching modes.
+
+     This enum allows you to specify which single matching technique should be used when evaluating
+     the efficiency of an agent's tool call sequence against a ground truth path.
+     """
+
+     EXACT_MATCH = "exact_match"
+     """
+     Binary metric indicating whether the agent's tool calls exactly match the ground truth.
+
+     Returns True only if the agent's tool call sequence is identical to the expected sequence
+     in both order and content (no extra steps, no missing steps, correct order).
+     """
+
+     IN_ORDER_MATCH = "in_order_match"
+     """
+     Binary metric allowing extra steps but requiring correct order of required tool calls.
+
+     Returns True if all ground truth steps appear in the agent's sequence in the correct
+     order, even if there are additional steps interspersed.
+     """
+
+     ANY_ORDER_MATCH = "any_order_match"
+     """
+     Binary metric allowing both extra steps and different ordering.
+
+     Returns True if all ground truth steps appear in the agent's sequence with sufficient
+     frequency, regardless of order. Most lenient matching criterion.
+     """
+
+
+ class _TaskNavigationEfficiencyEvaluator(EvaluatorBase):
+     """
+     Evaluates whether an agent's sequence of actions is efficient and follows optimal decision-making patterns.
+
+     The Task Navigation Efficiency Evaluator returns a binary matching result between the agent's tool usage
+     trajectory and the ground truth expected steps.
+     It has three matching techniques: exact match, in-order match (allows extra steps), and any-order match
+     (allows extra steps and ignores order).
+     It also returns precision, recall, and F1 scores in the properties bag.
+
+     :param matching_mode: The matching mode to use. Default is "exact_match".
+     :type matching_mode: Union[str, _TaskNavigationEfficiencyMatchingMode]
+
+     .. admonition:: Example:
+
+         .. code-block:: python
+
+             from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
+                 _TaskNavigationEfficiencyEvaluator,
+                 _TaskNavigationEfficiencyMatchingMode,
+             )
+
+             task_navigation_efficiency_eval = _TaskNavigationEfficiencyEvaluator(
+                 matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH
+             )
+
+             # Example 1: Using a simple list of tool names
+             result = task_navigation_efficiency_eval(
+                 response=[
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "identify_tools_to_call", "arguments": {}}]},
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "call_tool_A", "arguments": {}}]},
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]},
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]}
+                 ],
+                 ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
+             )
+
+             # Example 2: Using tool names with parameters (exact parameter matching required)
+             result = task_navigation_efficiency_eval(
+                 response=[
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_1", "name": "search", "arguments": {"query": "weather", "location": "NYC"}}]},
+                     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_2", "name": "format_result", "arguments": {"format": "json"}}]}
+                 ],
+                 ground_truth=(
+                     ["search", "format_result"],
+                     {
+                         "search": {"query": "weather", "location": "NYC"},
+                         "format_result": {"format": "json"}
+                     }
+                 )
+             )
+     """
+
+     id = "azureai://built-in/evaluators/task_navigation_efficiency"
+     """Evaluator identifier; experimental and to be used only with evaluation in the cloud."""
+
+     matching_mode: _TaskNavigationEfficiencyMatchingMode
+     """The matching mode to use."""
+
+     @override
+     def __init__(
+         self,
+         *,
+         matching_mode: Union[
+             str, _TaskNavigationEfficiencyMatchingMode
+         ] = _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH,
+     ):
+         # Type checking for the matching_mode parameter
+         if isinstance(matching_mode, str):
+             try:
+                 self.matching_mode = _TaskNavigationEfficiencyMatchingMode(matching_mode)
+             except ValueError:
+                 raise ValueError(
+                     f"matching_mode must be one of {[m.value for m in _TaskNavigationEfficiencyMatchingMode]}, got '{matching_mode}'"
+                 )
+         elif isinstance(matching_mode, _TaskNavigationEfficiencyMatchingMode):
+             self.matching_mode = matching_mode
+         else:
+             raise EvaluationException(
+                 f"matching_mode must be a string with one of {[m.value for m in _TaskNavigationEfficiencyMatchingMode]} or a _TaskNavigationEfficiencyMatchingMode enum, got {type(matching_mode)}",
+                 internal_message=str(matching_mode),
+                 target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
+                 category=ErrorCategory.INVALID_VALUE,
+             )
+
+         super().__init__()
+
+     def _prepare_steps_for_comparison(
+         self,
+         agent_tool_pairs: List[Tuple[str, Dict[str, Any]]],
+         ground_truth: List[str],
+         ground_truth_params: Dict[str, Dict[str, Any]],
+         use_parameter_matching: bool,
+     ) -> Tuple[
+         List[Union[str, Tuple[str, Tuple]]],
+         List[Union[str, Tuple[str, Tuple]]],
+     ]:
+         """Prepare agent and ground truth steps for comparison based on the parameter matching mode."""
+         agent_steps: List[Union[str, Tuple[str, Tuple]]] = []
+         ground_truth_steps: List[Union[str, Tuple[str, Tuple]]] = []
+         if use_parameter_matching:
+             # When parameter matching is enabled, match both tool name and parameters
+             agent_steps = [(pair[0], tuple(sorted(pair[1].items()))) for pair in agent_tool_pairs]
+             ground_truth_steps = [
+                 (name, tuple(sorted(ground_truth_params.get(name, {}).items()))) for name in ground_truth
+             ]
+         else:
+             # When parameter matching is disabled, only compare tool names
+             agent_steps = [name for name, _ in agent_tool_pairs]
+             ground_truth_steps = [step for step in ground_truth]
+
+         return agent_steps, ground_truth_steps
+
+     def _calculate_precision_recall_f1_scores(self, agent_steps: List, ground_truth_steps: List) -> Dict[str, float]:
+         """Calculate precision, recall, and F1 scores."""
+         if not agent_steps:
+             return {"precision_score": 0.0, "recall_score": 0.0, "f1_score": 0.0}
+
+         # Count occurrences of each step in both lists to handle duplicates
+         agent_steps_counts = Counter(agent_steps)
+         ground_truth_counts = Counter(ground_truth_steps)
+
+         # Calculate true positives by taking the minimum count for each common element:
+         # for each step, count the intersection (min count) of agent and ground truth steps
+         true_positives = sum(
+             min(agent_steps_counts[step], ground_truth_counts[step])
+             for step in agent_steps_counts
+             if step in ground_truth_counts
+         )
+
+         # Calculate false positives (agent steps not in ground truth, or excess occurrences):
+         # for each step, count agent occurrences beyond those matched in the ground truth,
+         # which is zero when the agent count does not exceed the ground truth count
+         false_positives = sum(
+             agent_steps_counts[step] - min(agent_steps_counts[step], ground_truth_counts.get(step, 0))
+             for step in agent_steps_counts
+         )
+
+         # Calculate false negatives (ground truth steps not in agent steps, or missing occurrences):
+         # for each step, count ground truth occurrences beyond those matched in the agent steps,
+         # which is zero when the ground truth count does not exceed the agent count
+         false_negatives = sum(
+             ground_truth_counts[step] - min(ground_truth_counts[step], agent_steps_counts.get(step, 0))
+             for step in ground_truth_counts
+         )
+
+         # Calculate precision, recall, F1
+         precision = (
+             true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
+         )
+         recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
+         f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
+
+         return {
+             "precision_score": precision,
+             "recall_score": recall,
+             "f1_score": f1_score,
+         }
+
+     def _calculate_exact_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
+         """Check if agent steps exactly match ground truth (order and content)."""
+         return agent_steps == ground_truth_steps
+
+     def _calculate_in_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
+         """Check if all ground truth steps appear in agent steps in the correct order (extra steps allowed)."""
+         if not ground_truth_steps:
+             return True
+
+         gt_index = 0
+         for step in agent_steps:
+             if gt_index < len(ground_truth_steps) and step == ground_truth_steps[gt_index]:
+                 gt_index += 1
+
+         return gt_index == len(ground_truth_steps)
+
+     def _calculate_any_order_match(self, agent_steps: List, ground_truth_steps: List) -> bool:
+         """Check if all ground truth steps appear in agent steps with sufficient frequency (any order, extra steps allowed)."""
+         # Count occurrences of each step in both lists to handle duplicates
+         agent_counts = Counter(agent_steps)
+         ground_truth_counts = Counter(ground_truth_steps)
+
+         # Check if the agent has at least as many occurrences of each ground truth step
+         return all(agent_counts[step] >= ground_truth_counts[step] for step in ground_truth_counts)
+
+     _TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS = {
+         _TaskNavigationEfficiencyMatchingMode.EXACT_MATCH: _calculate_exact_match,
+         _TaskNavigationEfficiencyMatchingMode.IN_ORDER_MATCH: _calculate_in_order_match,
+         _TaskNavigationEfficiencyMatchingMode.ANY_ORDER_MATCH: _calculate_any_order_match,
+     }
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[str, float]]]:
+         """Produce a task navigation efficiency evaluation result.
+
+         :param eval_input: The input to the evaluation function. Must contain "response" and "ground_truth".
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict[str, Union[float, str, Dict[str, float]]]
+         """
+         response = eval_input["response"]
+         ground_truth = eval_input["ground_truth"]
+
+         # Value and type checking for ground truth steps
+         if not ground_truth:
+             raise ValueError("ground_truth cannot be empty")
+
+         # Check if ground_truth is a tuple (tool names + parameters) or a list (tool names only)
+         use_parameter_matching = False
+         ground_truth_names = []
+         ground_truth_params_dict: Dict[str, Dict[str, Any]] = {}
+
+         if isinstance(ground_truth, tuple) and len(ground_truth) == 2:
+             # Tuple format: (tool_names, parameters_dict)
+             tool_names_list, params_dict = ground_truth
+
+             if not isinstance(tool_names_list, list) or not all(isinstance(name, str) for name in tool_names_list):
+                 raise TypeError("ground_truth tuple first element must be a list of strings (tool names)")
+
+             if not isinstance(params_dict, dict):
+                 raise TypeError(
+                     "ground_truth tuple second element must be a dictionary mapping tool names to parameters"
+                 )
+
+             # Validate that all values in params_dict are dictionaries with string keys and JSON-serializable values
+             for tool_name, params in params_dict.items():
+                 if not isinstance(tool_name, str):
+                     raise TypeError("ground_truth parameters dictionary keys must be strings (tool names)")
+                 if not isinstance(params, dict):
+                     raise TypeError(f"ground_truth parameters for tool '{tool_name}' must be a dictionary")
+                 for k, v in params.items():
+                     if not isinstance(k, str):
+                         raise TypeError(f"ground_truth parameters for tool '{tool_name}' must have string keys")
+                     try:
+                         json.dumps(v)
+                     except (TypeError, ValueError):
+                         raise TypeError(
+                             f"ground_truth parameters for tool '{tool_name}' must have JSON-serializable values (got type {type(v)} for key '{k}')"
+                         )
+
+             ground_truth_names = [name.strip() for name in tool_names_list]
+             ground_truth_params_dict = params_dict
+             use_parameter_matching = True
+         elif isinstance(ground_truth, list) and all(isinstance(step, str) for step in ground_truth):
+             # List format: just tool names
+             ground_truth_names = [step.strip() for step in ground_truth]
+             use_parameter_matching = False
+         else:
+             raise TypeError(
+                 "ground_truth must be a list of strings or a tuple of (list[str], dict[str, dict[str, str]])"
+             )
+
+         # Extract tool information from the response
+         agent_tool_pairs = self._extract_tool_names_and_params_from_response(response)
+
+         # Prepare steps for comparison
+         agent_steps, ground_truth_steps = self._prepare_steps_for_comparison(
+             agent_tool_pairs,
+             ground_truth_names,
+             ground_truth_params_dict,
+             use_parameter_matching,
+         )
+
+         # Calculate precision, recall, and F1 scores
+         additional_properties_metrics = self._calculate_precision_recall_f1_scores(agent_steps, ground_truth_steps)
+
+         # Convert metrics to floats, using nan for None or non-convertible values
+         for metric, score in additional_properties_metrics.items():
+             additional_properties_metrics[metric] = float(score) if score is not None else float("nan")
+
+         if self.matching_mode in self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS:
+             # Calculate the binary match metric
+             match_result = self._TASK_NAVIGATION_EFFICIENCY_MATCHING_MODE_TO_FUNCTIONS[self.matching_mode](
+                 self, agent_steps, ground_truth_steps
+             )
+
+             return {
+                 "task_navigation_efficiency_label": match_result,
+                 "task_navigation_efficiency_result": EVALUATION_PASS_FAIL_MAPPING[match_result],
+                 "task_navigation_efficiency_details": additional_properties_metrics,
+             }
+         else:
+             raise EvaluationException(
+                 f"Unsupported matching_mode '{self.matching_mode}'",
+                 internal_message=str(self.matching_mode),
+                 target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
+                 category=ErrorCategory.INVALID_VALUE,
+             )
+
+     @overload
+     def __call__(  # type: ignore
+         self, *, response: Union[str, List[Dict[str, Any]]], ground_truth: List[str]
+     ) -> Dict[str, Union[float, str, Dict[str, float]]]:
+         """
+         Evaluate the task navigation efficiency of an agent's action sequence.
+
+         :keyword response: The agent's response containing tool calls.
+         :paramtype response: Union[str, List[Dict[str, Any]]]
+         :keyword ground_truth: List of expected tool/action steps.
+         :paramtype ground_truth: List[str]
+         :return: The task navigation efficiency scores and results.
+         :rtype: Dict[str, Union[float, str, Dict[str, float]]]
+         """
+
+     @overload
+     def __call__(  # type: ignore
+         self,
+         *,
+         response: Union[str, List[Dict[str, Any]]],
+         ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]],
+     ) -> Dict[str, Union[float, str, Dict[str, float]]]:
+         """
+         Evaluate the task navigation efficiency of an agent's action sequence with tool parameters.
+
+         :keyword response: The agent's response containing tool calls.
+         :paramtype response: Union[str, List[Dict[str, Any]]]
+         :keyword ground_truth: Tuple of (tool names list, parameters dict) where parameters must match exactly.
+         :paramtype ground_truth: Tuple[List[str], Dict[str, Dict[str, str]]]
+         :return: The task navigation efficiency scores and results.
+         :rtype: Dict[str, Union[float, str, Dict[str, float]]]
+         """
+
+     @override
+     def __call__(
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Evaluate task navigation efficiency.
+
+         :keyword response: The agent's response containing tool calls.
+         :paramtype response: Union[str, List[Dict[str, Any]]]
+         :keyword ground_truth: List of expected tool/action steps or a tuple of (tool names, parameters dict).
+         :paramtype ground_truth: Union[List[str], Tuple[List[str], Dict[str, Dict[str, str]]]]
+         :return: The task navigation efficiency scores and results.
+         :rtype: Dict[str, Union[float, str, Dict[str, float]]]
+         """
+         return super().__call__(*args, **kwargs)
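
For readers skimming the diff, the semantics of `_calculate_exact_match`, `_calculate_in_order_match`, `_calculate_any_order_match`, and the Counter-based precision/recall/F1 above can be exercised standalone. The following sketch re-implements that same logic outside the class on illustrative toy sequences; it is not part of the package:

from collections import Counter

# Toy trajectories (illustrative): the agent calls B before A, the ground truth expects A before B.
agent = ["identify_tools_to_call", "call_tool_B", "call_tool_A", "response_synthesis"]
truth = ["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]

exact = agent == truth  # exact_match: identical order and content

def in_order(agent, truth):
    # in_order_match: ground truth must appear as a subsequence of the agent steps
    i = 0
    for step in agent:
        if i < len(truth) and step == truth[i]:
            i += 1
    return i == len(truth)

a, t = Counter(agent), Counter(truth)
any_order = all(a[s] >= t[s] for s in t)  # any_order_match: multiset containment

# Precision/recall/F1 over step multisets, as in _calculate_precision_recall_f1_scores:
# tp + fp equals the total agent step count, tp + fn the total ground truth count.
tp = sum(min(a[s], t[s]) for s in a if s in t)
precision = tp / sum(a.values())
recall = tp / sum(t.values())
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

print(exact, in_order(agent, truth), any_order)  # False False True
print(precision, recall, f1)                     # 1.0 1.0 1.0

The swapped order costs both binary order-sensitive modes while leaving the set-based metrics perfect, which is exactly why the evaluator reports the precision/recall/F1 trio alongside the single binary label.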
azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py
@@ -0,0 +1,9 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._tool_call_accuracy import ToolCallAccuracyEvaluator
+
+ __all__ = [
+     "ToolCallAccuracyEvaluator",
+ ]