azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py (new file)
@@ -0,0 +1,301 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT License.
+
+ import math
+ import os
+ import logging
+ from typing import Dict, Union, List, Optional
+ from typing_extensions import overload, override
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._common._experimental import experimental
+
+
+ logger = logging.getLogger(__name__)
+
+
+ @experimental
+ class _ToolSuccessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """The Tool Success evaluator determines whether the tool calls made by an AI agent include failures.
+
+     This evaluator focuses solely on tool call results and tool definitions, disregarding the user's query to
+     the agent, the conversation history, and the agent's final response. Although tool definitions are optional,
+     providing them can help the evaluator better understand the context of the tool calls made by the
+     agent. Please note that this evaluator validates tool calls for potential technical failures such as
+     errors, exceptions, timeouts, and empty results (only in cases where an empty result could indicate a
+     failure). It does not assess the correctness of the tool result itself, such as mathematical errors or
+     unrealistic field values like name="668656".
+
+     Scoring is binary:
+     - TRUE: All tool calls were successful
+     - FALSE: At least one tool call failed
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example:
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START tool_success_evaluator]
+             :end-before: [END tool_success_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a _ToolSuccessEvaluator with tool definitions and a response.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START tool_success_evaluator]
+             :end-before: [END tool_success_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a _ToolSuccessEvaluator using an Azure AI Project URL in the following
+                 format: https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     """
+
+     _PROMPTY_FILE = "tool_success.prompty"
+     _RESULT_KEY = "tool_success"
+     _OPTIONAL_PARAMS = ["tool_definitions"]
+
+     id = "azureai://built-in/evaluators/tool_success"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config, *, credential=None, **kwargs):
+         """Initialize the Tool Success evaluator."""
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=1,
+             credential=credential,
+             _higher_is_better=True,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         response: Union[str, List[dict]],
+         tool_definitions: Optional[Union[dict, List[dict]]] = None,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate tool call success for a given response and, optionally, tool definitions.
+
+         Example with a list of messages:
+             evaluator = _ToolSuccessEvaluator(model_config)
+             response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant',
+                 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
+
+             result = evaluator(response=response)
+
+         :keyword response: The response being evaluated, either a string or a list of messages (the full agent
+             response, potentially including tool calls).
+         :paramtype response: Union[str, List[dict]]
+         :keyword tool_definitions: Optional tool definitions to use for evaluation.
+         :paramtype tool_definitions: Union[dict, List[dict]]
+         :return: A dictionary with the tool success evaluation results.
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Invoke the instance using the overloaded __call__ signature.
+
+         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # type: ignore[override]
+         """Do Tool Success evaluation.
+
+         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are
+             needed for the _flow method.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         if "response" not in eval_input:
+             raise EvaluationException(
+                 message="response is a required input to the Tool Success evaluator.",
+                 internal_message="response is a required input to the Tool Success evaluator.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 target=ErrorTarget.TOOL_SUCCESS_EVALUATOR,
+             )
+         if eval_input["response"] is None or eval_input["response"] == []:
+             raise EvaluationException(
+                 message="response cannot be None or empty for the Tool Success evaluator.",
+                 internal_message="response cannot be None or empty for the Tool Success evaluator.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.INVALID_VALUE,
+                 target=ErrorTarget.TOOL_SUCCESS_EVALUATOR,
+             )
+
+         eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
+
+         if "tool_definitions" in eval_input:
+             tool_definitions = eval_input["tool_definitions"]
+             filtered_tool_definitions = _filter_to_used_tools(
+                 tool_definitions=tool_definitions,
+                 msgs_list=eval_input["response"],
+                 logger=logger,
+             )
+             eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)
+
+         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+         llm_output = prompty_output_dict.get("llm_output", "")
+
+         if isinstance(llm_output, dict):
+             success = llm_output.get("success", False)
+             if isinstance(success, str):
+                 success = success.upper() == "TRUE"
+
+             success_result = "pass" if success else "fail"
+             reason = llm_output.get("explanation", "")
+             return {
+                 f"{self._result_key}": success * 1.0,
+                 f"{self._result_key}_result": success_result,
+                 f"{self._result_key}_threshold": self._threshold,
+                 f"{self._result_key}_reason": f"{reason} {llm_output.get('details', '')}",
+                 f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                 f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                 f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                 f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                 f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                 f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+             }
+         if logger:
+             logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+         score = math.nan
+         binary_result = self._get_binary_result(score)
+         return {
+             self._result_key: float(score),
+             f"{self._result_key}_result": binary_result,
+             f"{self._result_key}_threshold": self._threshold,
+         }
+
+
+ def _filter_to_used_tools(tool_definitions, msgs_list, logger=None):
+     """Filter the tool definitions to only include those that were actually used in the message list."""
+     try:
+         used_tool_names = set()
+         any_tools_used = False
+
+         for msg in msgs_list:
+             if msg.get("role") == "assistant" and "content" in msg:
+                 for content in msg.get("content", []):
+                     if content.get("type") == "tool_call":
+                         any_tools_used = True
+                         if "tool_call" in content and "function" in content["tool_call"]:
+                             used_tool_names.add(content["tool_call"]["function"])
+                         elif "name" in content:
+                             used_tool_names.add(content["name"])
+
+         filtered_tools = [tool for tool in tool_definitions if tool.get("name") in used_tool_names]
+         if any_tools_used and not filtered_tools:
+             if logger:
+                 logger.warning("No tool definitions matched the tools used in the messages. Returning original list.")
+             filtered_tools = tool_definitions
+
+         return filtered_tools
+     except Exception as e:
+         if logger:
+             logger.warning(f"Failed to filter tool definitions, returning original list. Error: {e}")
+         return tool_definitions
+
+
+ def _get_tool_calls_results(agent_response_msgs):
+     """Extract formatted agent tool calls and results from the response."""
+     agent_response_text = []
+     tool_results = {}
+
+     # First pass: collect tool results
+     for msg in agent_response_msgs:
+         if msg.get("role") == "tool" and "tool_call_id" in msg:
+             for content in msg.get("content", []):
+                 if content.get("type") == "tool_result":
+                     result = content.get("tool_result")
+                     tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"
+
+     # Second pass: parse assistant messages and tool calls
+     for msg in agent_response_msgs:
+         if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
+             for content in msg.get("content", []):
+                 if content.get("type") == "tool_call":
+                     if "tool_call" in content and "function" in content.get("tool_call", {}):
+                         tc = content.get("tool_call", {})
+                         func_name = tc.get("function", {}).get("name", "")
+                         args = tc.get("function", {}).get("arguments", {})
+                         tool_call_id = tc.get("id")
+                     else:
+                         tool_call_id = content.get("tool_call_id")
+                         func_name = content.get("name", "")
+                         args = content.get("arguments", {})
+                     args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
+                     call_line = f"[TOOL_CALL] {func_name}({args_str})"
+                     agent_response_text.append(call_line)
+                     if tool_call_id in tool_results:
+                         agent_response_text.append(tool_results[tool_call_id])
+
+     return agent_response_text
+
+
+ def _reformat_tool_calls_results(response, logger=None):
+     try:
+         if response is None or response == []:
+             return ""
+         agent_response = _get_tool_calls_results(response)
+         if agent_response == []:
+             # If no message could be extracted, the format has likely changed;
+             # fall back to the original response in that case.
+             if logger:
+                 logger.warning(
+                     f"Empty agent response extracted, likely due to input schema change. "
+                     f"Falling back to using the original response: {response}"
+                 )
+             return response
+         return "\n".join(agent_response)
+     except Exception:
+         # If the agent response cannot be parsed for whatever
+         # reason (e.g. the converter format changed), the original response is returned.
+         # This is a fallback to ensure that the evaluation can still proceed.
+         # See comments on reformat_conversation_history for more details.
+         if logger:
+             logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
+         return response
+
+
+ def _reformat_tool_definitions(tool_definitions, logger=None):
+     try:
+         output_lines = ["TOOL_DEFINITIONS:"]
+         for tool in tool_definitions:
+             name = tool.get("name", "unnamed_tool")
+             desc = tool.get("description", "").strip()
+             params = tool.get("parameters", {}).get("properties", {})
+             param_names = ", ".join(params.keys()) if params else "no parameters"
+             output_lines.append(f"- {name}: {desc} (inputs: {param_names})")
+         return "\n".join(output_lines)
+     except Exception:
+         # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned.
+         # This is a fallback to ensure that the evaluation can still proceed.
+         # See comments on reformat_conversation_history for more details.
+         if logger:
+             logger.warning(
+                 f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
+             )
+         return tool_definitions
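
For context, the following is a minimal usage sketch implied by the signatures above. It is not part of the diff: the endpoint, deployment, and key values are placeholders, the message layout simply mirrors what _get_tool_calls_results and _filter_to_used_tools parse, and the printed keys follow the _RESULT_KEY naming used in _do_eval.

# Hypothetical usage sketch (not from the package); configuration values are placeholders.
from azure.ai.evaluation._evaluators._tool_success._tool_success import _ToolSuccessEvaluator

# Assumed Azure OpenAI model configuration; replace with a real endpoint, deployment, and key.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

# Agent response in the message shape the helpers above expect:
# an assistant tool_call plus a tool message carrying the matching tool_result.
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_1",
                "name": "get_weather_info",
                "arguments": {"city": "London"},
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "the temperature is 23 C and it is cloudy"}],
    },
]

tool_definitions = [
    {
        "name": "get_weather_info",
        "description": "Returns today's weather information for the specified city",
        "parameters": {"properties": {"city": {"type": "string"}}},
    }
]

evaluator = _ToolSuccessEvaluator(model_config)
result = evaluator(response=response, tool_definitions=tool_definitions)

# Per _do_eval, the result includes tool_success (1.0, 0.0, or NaN), tool_success_result
# ("pass"/"fail"), tool_success_threshold, tool_success_reason, and token/model metadata.
print(result["tool_success"], result["tool_success_result"])
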
azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty (new file)
@@ -0,0 +1,321 @@
+ ---
+ name: Tool Success
+ description: Evaluates whether a tool call was successful or resulted in a technical error
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 1500
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: json_object
+ inputs:
+   tool_calls:
+     type: List
+   tool_definitions:
+     type: Dict
+ ---
+ system:
+
+ You are an expert evaluator with a strong software development background. You are required to extract the tool result for every tool call, then decide for each tool result whether it indicates that the tool succeeded or failed.
+
+ user:
+ ROLE
+ ====
+ You are a judge of tool call success who assesses **each tool call made by an AI agent and decides whether the result of the tool call indicates a success or a failure**. You only care about technical errors, failures and exceptions, not the business correctness of the tool implementation.
+
+ You are NOT evaluating:
+ - The parameters passed to the tool
+ - The rationale behind choosing this tool
+ - Whether the successfully returned result from the tool is correct or not business-wise given the tool name and definition
+
+ You **ARE ONLY** evaluating:
+ - Whether tool results indicate the presence of a technical error
+
+ **INPUT**
+ =====
+ TOOL_DEFINITIONS: {{tool_definitions}}
+ TOOL_CALLS: {{tool_calls}}
+
+ TOOL_CALLS is a list of tool calls that were produced by the AI agent. It includes the calls together with the result of every tool call.
+ TOOL_DEFINITIONS is a list of definitions for the tools that were called. A definition can contain a description of the functionality provided by the tool, the parameters that the tool accepts, and the expected return of the tool. This definition can contribute to the assessment of whether a tool call succeeded or failed.
+
+ EVALUATION FRAMEWORK
+ ====================
+
+ A. Iterate over the list of tool calls
+ B. Examine the tool result and the definition of the tool being called to check whether the call **succeeded** or **failed**, as the following steps explain:
+   1. A tool result is **failed** if **any** of the following ERROR-CASES applies to it:
+      ERROR-CASES:
+      ===========
+      - The tool call resulted in an error or exception
+      - The tool call failed to run or failed to return
+      - The tool call returned a result that indicates an error or failure
+      - The tool call returned an object or JSON string that has one or more of its fields indicating an error
+      - The tool timed out or returned a result that indicates a time-out
+      - The tool result does not make sense from a technical perspective (not a business perspective), given the definition of that tool, if the definition is present
+   2. If none of the error cases apply to the tool result, it is considered **succeeded** even if the tool result itself indicates a business mistake
+ C. If one or more tool results are **failed**, then the **evaluation process** has **failed**; otherwise, the **evaluation process** has **succeeded**
+ D. You are required to return your **output** in the following format:
+ {
+   "explanation": "<15-60 words explaining the logic flow of your decision>",
+   "details": {
+     "failed_tools": "<comma-separated list of the tools that have failed results, if any>",
+   },
+   "success": <True or False based on whether the **evaluation process** has **succeeded** or **failed**>
+ }
+ E. If no tool calls are found at all, the TOOL_CALLS input is empty, or the TOOL_CALLS input is not passed, the **evaluation process** has **succeeded**
+
+ ## Successful Evaluation Process Examples
+ ========================================
+
+ ### Example - Succeeded
+
+ [TOOL_CALLS]
+ [TOOL_CALL] get_account_balances(user_id="USER456")
+ [TOOL_RESULT] {'accounts': [{'account_id': 'CHK001', 'type': 'checking', 'balance': 1250.75}, {'account_id': 'SAV001', 'type': 'savings', 'balance': 3400.20}]}
+ [TOOL_CALL] get_weather_info()
+ [TOOL_RESULT] "the temperature is 23 C and it is cloudy"
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "None of the results indicate an error",
+   "details": {
+     "failed_tools": "",
+   },
+   "success": True
+ }
+
+ ### Example - Succeeded
+
+ [TOOL_CALLS]
+ [TOOL_CALL] get_employee_info(employee_id="EMP2568")
+ [TOOL_RESULT] {"name":"David", "Age":32}
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "None of the results indicate an error",
+   "details": {
+     "failed_tools": "",
+   },
+   "success": True
+ }
+
+ ### Example - Succeeded
+
+ [TOOL_DEFINITIONS] [get_sqrt] gets the square root of the input parameter
+ [TOOL_CALLS]
+ [TOOL_CALL] get_sqrt(4)
+ [TOOL_RESULT] {"value":7}
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "Although the returned value 7 is not the square root of 4, this is a business mistake in the tool. The tool did not return a result indicating a technical error",
+   "details": {
+     "failed_tools": "",
+   },
+   "success": True
+ }
+
+ ### Example - Succeeded
+
+ [TOOL_DEFINITIONS] [get_blocked_usernames] gets a comma-separated list of usernames for blocked users
+ [TOOL_CALLS]
+ [TOOL_CALL] get_blocked_usernames()
+ [TOOL_RESULT] "david33;amr_master;phantom5"
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "The tool returned a semicolon-separated list of names. Although the description in the definition says it should return a comma-separated list, this formatting mistake is a business mistake of the tool, not a technical failure. The tool did not return an error",
+   "details": {
+     "failed_tools": "",
+   },
+   "success": True
+ }
+
+ ### Example - Succeeded
+
+ [TOOL_DEFINITIONS] [update_user_email] Updates the email of the given user id to the new email specified in the parameters
+ [TOOL_CALLS]
+ [TOOL_CALL] update_user_email(userId:2251, newEmail:"david235@mydomain.com")
+ [TOOL_RESULT] {}
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "The tool returned an empty response, which is accepted given that this tool's functionality does not include returning data to the caller",
+   "details": {
+     "failed_tools": "",
+   },
+   "success": True
+ }
+
+ ## Failed Evaluation Process Examples
+ ========================================
+
+ ### Example - Failed
+
+ [TOOL_DEFINITIONS] [get_weather_info] returns today's weather information for the specified city
+ [TOOL_CALLS]
+ [TOOL_CALL] get_weather_info(city:"London")
+ [TOOL_RESULT] ""
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "The tool returned an empty response; however, given the tool definition, it should never return an empty response because there should be weather info at any given point in time. An empty response here is considered a technical failure. The conclusion is that get_weather_info failed",
+   "details": {
+     "failed_tools": "get_weather_info",
+   },
+   "success": False
+ }
+
+ ### Example - Failed
+
+ [TOOL_CALLS]
+ [TOOL_CALL] get_current_user_Info()
+ [TOOL_RESULT] "failed to get current user information"
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "The tool returned a string indicating that it failed",
+   "details": {
+     "failed_tools": "get_current_user_Info",
+   },
+   "success": False
+ }
+
+ ### Example - Failed
+
+ [TOOL_CALLS]
+ [TOOL_CALL] get_current_user_Info()
+ [TOOL_RESULT] {"UserName":"", "UserEmail":"", "Message":"failed to get current user information"}
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "The tool returned an object with empty fields and a string indicating that it failed",
+   "details": {
+     "failed_tools": "get_current_user_Info",
+   },
+   "success": False
+ }
+
+ ### Example - Failed
+
+ [TOOL_CALLS]
+ [TOOL_CALL] GetWeatherInfo()
+ [TOOL_RESULT] {temp:""}
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "The call to GetWeatherInfo returned an object containing a single property 'temp' that is an empty string. This means the call to GetWeatherInfo returned an empty result while weather info should be available at any time",
+   "details": {
+     "failed_tools": "GetWeatherInfo",
+   },
+   "success": False
+ }
+
+ ### Example - Failed
+
+ [TOOL_CALLS]
+ [TOOL_CALL] get_day_of_week(date:"1/1/2023")
+ [TOOL_RESULT] time out
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "The returned result indicates that the call to get_day_of_week timed out",
+   "details": {
+     "failed_tools": "get_day_of_week",
+   },
+   "success": False
+ }
+
+ ### Example - Failed
+
+ [TOOL_DEFINITIONS] [get_day_of_week] Takes a date as an input and returns the day of week that this date represents
+ [TOOL_CALLS]
+ [TOOL_CALL] get_day_of_week(date:"1/1/2023")
+ [TOOL_RESULT] null
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "null indicates an empty result, which cannot be an accepted output of the tool given the tool definition, since any given date represents a day of week",
+   "details": {
+     "failed_tools": "get_day_of_week",
+   },
+   "success": False
+ }
+
+ ### Example - Failed
+
+ [TOOL_DEFINITIONS] [get_day_of_week] Takes a date as an input and returns the day of week that this date represents
+ [TOOL_CALLS]
+ [TOOL_CALL] get_day_of_week(date:"1/1/2023")
+ [TOOL_RESULT] {}
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "An empty object cannot be an accepted output of the tool given the tool definition, since any given date should represent a day of week",
+   "details": {
+     "failed_tools": "get_day_of_week",
+   },
+   "success": False
+ }
+
+ ### Example - Failed
+
+ [TOOL_CALLS]
+ [TOOL_CALL] GetWeatherInfo()
+ [TOOL_RESULT] {temp:""}
+ [TOOL_CALL] BookTicket(flightId:"FL23", Seat:"A17")
+ [TOOL_RESULT] "Failed to book the ticket"
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "GetWeatherInfo returned an empty response while it should return the weather info, and BookTicket returned an error. Both tools failed.",
+   "details": {
+     "failed_tools": "GetWeatherInfo,BookTicket",
+   },
+   "success": False
+ }
+
+ ### Example - Failed
+
+ [TOOL_CALLS]
+ [TOOL_CALL] GetWeatherInfo()
+ [TOOL_RESULT] {temp:"23 C"}
+ [TOOL_CALL] BookTicket(flightId:"FL23", Seat:"A17")
+ [TOOL_RESULT] "Failed to book the ticket"
+
+ EXPECTED OUTPUT
+ {
+   "explanation": "Although GetWeatherInfo succeeded, BookTicket returned an error. The final result is failure because one of the tool calls has failed",
+   "details": {
+     "failed_tools": "BookTicket",
+   },
+   "success": False
+ }
+
+ Now, given the **INPUT** you received, generate the output
+ # Output
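
To connect the two new files, here is a small illustrative sketch (not from the package) of how the helpers in _tool_success.py render an agent's messages into the TOOL_CALLS and TOOL_DEFINITIONS text that this prompty template receives. The book_ticket tool and the messages below are made up for illustration.

# Hypothetical example; the tool name and messages are invented.
from azure.ai.evaluation._evaluators._tool_success._tool_success import (
    _reformat_tool_calls_results,
    _reformat_tool_definitions,
)

messages = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call_id": "call_42",
                "name": "book_ticket",
                "arguments": {"flightId": "FL23", "seat": "A17"},
            }
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_42",
        "content": [{"type": "tool_result", "tool_result": "Failed to book the ticket"}],
    },
]
definitions = [
    {
        "name": "book_ticket",
        "description": "Books a seat on the given flight",
        "parameters": {"properties": {"flightId": {}, "seat": {}}},
    }
]

print(_reformat_tool_calls_results(messages))
# [TOOL_CALL] book_ticket(flightId="FL23", seat="A17")
# [TOOL_RESULT] Failed to book the ticket
print(_reformat_tool_definitions(definitions))
# TOOL_DEFINITIONS:
# - book_ticket: Books a seat on the given flight (inputs: flightId, seat)

Given that rendering, the judge prompt above would report book_ticket as a failed tool and return "success": False.
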
azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py (new file)
@@ -0,0 +1,5 @@
+ from ._ungrounded_attributes import UngroundedAttributesEvaluator
+
+ __all__ = [
+     "UngroundedAttributesEvaluator",
+ ]