azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic. See the registry listing for details.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +85 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +147 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +87 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +430 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py +7 -0
  155. azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py +306 -0
  156. azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty +321 -0
  157. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  158. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  159. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  160. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  161. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  162. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  163. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  165. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/METADATA +378 -27
  264. azure_ai_evaluation-1.13.5.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -0,0 +1,298 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import math
+ import os
+ import logging
+ import re
+ from typing import Dict, List, Union, TypeVar, Optional
+ from typing_extensions import overload, override
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._exceptions import (
+     ErrorBlame,
+     ErrorCategory,
+     ErrorTarget,
+     EvaluationException,
+ )
+ from ..._common.utils import check_score_is_valid
+ from azure.ai.evaluation._common._experimental import experimental
+
+ logger = logging.getLogger(__name__)
+
+ T_EvalValue = TypeVar("T_EvalValue")
+
+
+ @experimental
+ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """The Tool Call Accuracy evaluator assesses how accurately an AI uses tools by examining:
+         - Relevance to the conversation.
+         - Parameter correctness according to tool definitions.
+         - Parameter value extraction from the conversation.
+
+     The evaluator uses a scoring rubric of 1 to 5:
+         - Score 1: The tool calls are irrelevant.
+         - Score 2: The tool calls are partially relevant, but not enough tools were called or the parameters were not correctly passed.
+         - Score 3: The tool calls are relevant, but there were unnecessary, excessive tool calls made.
+         - Score 4: The tool calls are relevant, but some tools returned errors and the agent retried calling them and succeeded.
+         - Score 5: The tool calls are relevant, and all parameters were correctly passed.
+
+     This evaluation focuses on measuring whether tool calls meaningfully contribute to addressing
+     user needs while properly following tool definitions and using information present in the
+     conversation history.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START tool_call_accuracy_evaluator]
+             :end-before: [END tool_call_accuracy_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a ToolCallAccuracyEvaluator.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START tool_call_accuracy_evaluator]
+             :end-before: [END tool_call_accuracy_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call ToolCallAccuracyEvaluator using an Azure AI Project URL in the following format:
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+     .. note::
+
+         The output field "details" has been renamed to "tool_call_accuracy_details" for clarity.
+
+         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+         To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+
+     """
+
+     _PROMPTY_FILE = "tool_call_accuracy.prompty"
+     _RESULT_KEY = "tool_call_accuracy"
+
+     _MAX_TOOL_CALL_ACCURACY_SCORE = 5
+     _MIN_TOOL_CALL_ACCURACY_SCORE = 1
+     _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3
+
+     _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
+     _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
+     _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
+     _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
+
+     _LLM_SCORE_KEY = "tool_calls_success_level"
+
+     id = "azureai://built-in/evaluators/tool_call_accuracy"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE, credential=None, **kwargs):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self.threshold = threshold
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             credential=credential,
+             threshold=threshold,
+             **kwargs,
+         )
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: Union[str, List[dict]],
+         tool_definitions: Union[dict, List[dict]],
+         tool_calls: Union[dict, List[dict]] = None,
+         response: Union[str, List[dict]] = None,
+     ) -> Dict[str, Union[str, float]]:
+         """
+         Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
+
+         :keyword query: Query or chat history up to the message that contains the tool call being evaluated.
+         :paramtype query: Union[str, List[dict]]
+         :keyword tool_definitions: List of tool definitions whose calls are being evaluated.
+         :paramtype tool_definitions: Union[dict, List[dict]]
+         :keyword tool_calls: Optional list of tool calls to evaluate. If not provided, the response must be
+             provided and must contain tool call(s).
+         :paramtype tool_calls: Union[dict, List[dict]]
+         :keyword response: Optional response to be evaluated alongside the tool calls.
+             If provided without tool_calls, all tool calls in the response will be evaluated.
+             If both response and tool_calls are provided, only the tool calls in tool_calls will be evaluated;
+             extra tool calls in the response are not evaluated, but the response is used to extract any tool
+             calls needed to evaluate a given tool call.
+             Providing the response is recommended when tool calls depend on the output of a previous tool call.
+         :paramtype response: Union[str, List[dict]]
+         :return: The tool call accuracy evaluation results.
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     def _convert_kwargs_to_eval_input(self, **kwargs):
+         """Convert an arbitrary input into a list of inputs for evaluators.
+         It is assumed that evaluators generally make use of their inputs in one of two ways.
+         Either they receive a collection of keyword inputs that are all single values
+         (like a query and response), or they receive a conversation that is a list of dictionary
+         values.
+
+         The self._singleton_inputs list assigned during initialization is used to find and extract
+         singleton keywords, and self._allow_conversation_input is used to determine if a conversation
+         is a valid input.
+
+         If both conversations and singletons are allowed, the function will raise an exception if both
+         are provided.
+
+         This function must be overridden by child classes IF they need both a conversation and
+         other inputs to be passed in.
+
+         :keyword kwargs: The inputs to convert.
+         :type kwargs: Dict
+         :return: A list of arbitrary values that are valid inputs for this evaluator's do_eval function.
+         :rtype: List
+         """
+         # TODO add warning that only tool calls of type function are supported
+         # Collect inputs
+         tool_calls = kwargs.get("tool_calls")
+         tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
+         query = kwargs.get("query")
+         response = kwargs.get("response")
+         # TODO: Support classes that represent tool calls, messages etc. once client-side definitions are available
+         if response:
+             parsed_tool_calls = self._parse_tools_from_response(response)
+             if parsed_tool_calls:
+                 tool_calls = parsed_tool_calls
+
+         if not tool_calls:
+             return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+
+         if not isinstance(tool_calls, list):
+             tool_calls = [tool_calls]
+         if not isinstance(tool_definitions, list):
+             tool_definitions = [tool_definitions] if tool_definitions else []
+
+         try:
+             needed_tool_definitions = self._extract_needed_tool_definitions(
+                 tool_calls, tool_definitions, ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR
+             )
+         except EvaluationException as e:
+             # Check if this is because no tool definitions were provided at all
+             if len(tool_definitions) == 0:
+                 return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+             else:
+                 return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+         if len(needed_tool_definitions) == 0:
+             return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+
+         return {
+             "query": query,
+             "tool_calls": tool_calls,
+             "tool_definitions": needed_tool_definitions,
+         }
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+         """Do a tool call accuracy evaluation.
+
+         :param eval_input: The input to the evaluator. Expected to contain
+             whatever inputs are needed for the _flow method, including context
+             and other fields depending on the child class.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         # Single LLM call for all tool calls
+         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+         llm_output = prompty_output_dict.get("llm_output", {})
+         if isinstance(llm_output, dict):
+             score = llm_output.get(self._LLM_SCORE_KEY, None)
+             if not score or not check_score_is_valid(
+                 score,
+                 ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE,
+                 ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE,
+             ):
+                 raise EvaluationException(
+                     message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
+                     internal_message="Invalid score value.",
+                     category=ErrorCategory.FAILED_EXECUTION,
+                     target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+                     blame=ErrorBlame.SYSTEM_ERROR,
+                 )
+
+             # Format the output
+             reason = llm_output.get("chain_of_thought", "")
+             score = float(score)
+             score_result = "pass" if score >= self.threshold else "fail"
+             response_dict = {
+                 self._result_key: score,
+                 f"gpt_{self._result_key}": score,
+                 f"{self._result_key}_result": score_result,
+                 f"{self._result_key}_threshold": self._threshold,
+                 f"{self._result_key}_reason": reason,
+                 f"{self._result_key}_details": llm_output.get("details", {}),
+                 f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                 f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                 f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                 f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                 f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                 f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                 f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+             }
+             return response_dict
+
+         else:
+             raise EvaluationException(
+                 message="Tool call accuracy evaluator returned invalid output.",
+                 blame=ErrorBlame.SYSTEM_ERROR,
+                 category=ErrorCategory.FAILED_EXECUTION,
+                 target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+             )
+
+     async def _real_call(self, **kwargs):
+         """The asynchronous call where real end-to-end evaluation logic is performed.
+
+         :keyword kwargs: The inputs to evaluate.
+         :type kwargs: Dict
+         :return: The evaluation result.
+         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+         """
+         # Convert inputs into a list of evaluable inputs.
+         eval_input = self._convert_kwargs_to_eval_input(**kwargs)
+         if isinstance(eval_input, dict) and eval_input.get("error_message"):
+             # If there is an error message, return a not-applicable result
+             return self._not_applicable_result(eval_input.get("error_message"), self.threshold)
+         # Do the evaluation
+         result = await self._do_eval(eval_input)
+         # Return the result
+         return result
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Evaluate tool call accuracy. Accepts a query, tool definitions, and tool calls for evaluation.
+
+         :keyword query: Query or chat history up to the message that contains the tool call being evaluated.
+         :paramtype query: Union[str, List[dict]]
+         :keyword tool_definitions: List of tool definitions whose calls are being evaluated.
+         :paramtype tool_definitions: Union[dict, List[dict]]
+         :keyword tool_calls: Optional list of tool calls to evaluate. If not provided, the response must be
+             provided and must contain tool call(s).
+         :paramtype tool_calls: Union[dict, List[dict]]
+         :keyword response: Optional response to be evaluated alongside the tool calls.
+             If provided without tool_calls, all tool calls in the response will be evaluated.
+             If both response and tool_calls are provided, only the tool calls in tool_calls will be evaluated;
+             extra tool calls in the response are not evaluated, but the response is used to extract any tool
+             calls needed to evaluate a given tool call.
+             Providing the response is recommended when tool calls depend on the output of a previous tool call.
+         :paramtype response: Union[str, List[dict]]
+         :return: The tool call accuracy evaluation results.
+         :rtype: Dict[str, Union[str, float]]
+         """
+         return super().__call__(*args, **kwargs)
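
For orientation, below is a minimal, illustrative sketch of calling the evaluator added above. It assumes ToolCallAccuracyEvaluator is importable from the package root (as the updated __init__.py suggests); the endpoint, deployment, tool name, and message shapes are hypothetical placeholders rather than values taken from this release.

from azure.ai.evaluation import ToolCallAccuracyEvaluator

# Hypothetical Azure OpenAI judge configuration; substitute real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<judge-deployment>",
}

evaluator = ToolCallAccuracyEvaluator(model_config=model_config)

# Hypothetical single tool call and matching definition, mirroring the
# query / tool_calls / tool_definitions keywords documented in the class.
result = evaluator(
    query="What is the weather in Seattle today?",
    tool_calls={
        "type": "tool_call",
        "tool_call_id": "call_1",
        "name": "get_weather",
        "arguments": {"city": "Seattle"},
    },
    tool_definitions={
        "name": "get_weather",
        "description": "Returns the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
        },
    },
)

# Result keys follow _RESULT_KEY ("tool_call_accuracy") and its suffixed variants.
print(result["tool_call_accuracy"], result["tool_call_accuracy_result"])
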
azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty
@@ -0,0 +1,166 @@
+ ---
+ name: Tool Call Accuracy
+ description: Evaluates Tool Call Accuracy for tools used by an agent
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 3000
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: json_object
+
+ inputs:
+   query:
+     type: List
+   tool_calls:
+     type: List
+   tool_definitions:
+     type: Dict
+
+ ---
+ system:
+ # Instruction
+ ## Goal
+ ### You are an expert in evaluating the accuracy of a tool call, considering relevance and potential usefulness, including the syntactic and semantic correctness of a proposed tool call from an intelligent system, based on the provided definition and data. Your goal will involve answering the questions below using the information provided.
+ - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
+ - **Data**: Your input data include CONVERSATION, TOOL CALL and TOOL DEFINITION.
+ - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways, and you need to be very precise in your evaluation.
+
+ user:
+ # Definition
+ **Tool Call Accuracy** refers to the overall effectiveness of ALL TOOL CALLS made by an agent in response to a user's query within an ongoing CONVERSATION.
+
+ # EVALUATION CRITERIA
+ Evaluate based on these factors:
+
+ 1. **Collective Relevance**: Do the tool calls, taken together, appropriately address the user's query?
+ 2. **Parameter Correctness**: Are all parameter values extracted from or reasonably inferred from the CONVERSATION?
+    - *Fabricated parameters automatically result in Level 2*
+ 3. **Completeness**: Did the agent make all necessary tool calls available in the tool definitions?
+    - *Failed calls don't count as missing*
+ 4. **Efficiency**: Did the agent avoid unnecessary duplicate tool calls with identical parameters?
+    - *Don't penalize single tools returning multiple results (like file_search)*
+ 5. **Execution Success**: Were tool calls executed successfully or recovered from errors appropriately?
+ 6. **Scope Limitation**: ONLY evaluate tool calls in the "TOOL CALLS TO BE EVALUATED" section.
+    - Tool calls in the CONVERSATION section are for context only
+    - Focus exclusively on the agent's response to the user's LAST query
+    - Use conversation history only to verify parameter correctness and context
+
+ **Success Criteria**: Tools should retrieve relevant data to help answer the query. Complete final answers are not required from individual tools.
+
+ **Tool Assessment**: Focus solely on appropriate use of available tools, not on capabilities beyond what tools can provide.
+
+
+ # Ratings
+ ## [Tool Call Accuracy: 1] (Irrelevant)
+ **Definition:**
+ Tool calls were not relevant to the user's query, resulting in an irrelevant or unhelpful final output.
+
+ **Example:**
+ User asks for the distance between two cities -> Agent calls a weather function to get the weather in the two cities.
+
+
+ ## [Tool Call Accuracy: 2] (Partially Relevant - Wrong Execution)
+ **Definition:**
+ Tool calls were somewhat related to the user's query, but the agent was not able to reach information that helps address the user query due to one or more of the following:
+ • Parameters passed to the tool were incorrect.
+ • Not enough tools (available in the tool definitions) were called to fully help address the query (missing tool calls).
+ • Tools returned errors, and no retrials for the tool call were successful.
+
+
+ **Example:**
+ The user asks for the coordinates of Chicago. The agent calls the tool that gets the coordinates but passes 'New York' instead of Chicago as the parameter.
+
+ **Example:**
+ The user asks for the coordinates of Chicago. The agent calls the tool that gets the coordinates and passes 'Chicago' as the tool parameter, but the tool returns an error.
+
+ **Example:**
+ The user asks a question that needs 3 tool calls for it to be answered. The agent calls only one of the three required tool calls. So this case is a Level 2.
+
+
+ ## [Tool Call Accuracy: 3] (Relevant but Inefficient)
+ **Definition:**
+ Tool calls were relevant, and correct, grounded parameters were passed, leading to a correct output. However, multiple excessive, unnecessary tool calls were made.
+
+ **Important**: Do NOT penalize built-in tools like file_search that naturally return multiple results in a single call. Only penalize when there are actually multiple separate tool call objects.
+
+ **Example:**
+ The user asked to do a modification in the database. The agent called the tool multiple times, resulting in multiple modifications in the database instead of one.
+
+ **Example:**
+ The user asked for popular hotels in a certain place. The agent calls the same tool with the same parameters multiple times, even though a single tool call that returns an output is sufficient. So there were unnecessary tool calls.
+
+
+ ## [Tool Call Accuracy: 4] (Correct with Retrials)
+ **Definition:**
+ Tool calls were fully relevant and efficient:
+ • Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query.
+ • A tool returned an error, but the agent retried calling the tool and successfully got an output.
+
+ **Example:**
+ The user asks for the weather forecast in a certain place. The agent calls the correct tool that retrieves the weather forecast, but the tool returns an error. The agent calls the tool once again and it returns the correct output. This is a Level 4.
+
+
+ ## [Tool Call Accuracy: 5] (Optimal Solution)
+ **Definition:**
+ Tool calls were fully relevant and efficient:
+ • Correct tools were called with the correct and grounded parameters, whether they are extracted from the conversation history or the current user query.
+ • No unnecessary or excessive tool calls were made.
+ • No errors occurred in any of the tools.
+ • The tool calls made helped the agent address the user's query without facing any issues.
+
+ **Example:**
+ The user asks for the distance between two places. The agent correctly calls the tools that retrieve the coordinates for the two places respectively, then calls the tool that calculates the distance between the two sets of coordinates, passing the correct arguments to all the tools, without calling other tools excessively or unnecessarily. This is the optimal solution for the user's query.
+
+ **Example:**
+ The user asks for the distance between two places. The agent retrieves the needed coordinates from the outputs of the tool calls in the conversation history, and then correctly passes these coordinates to the tool that calculates the distance to output it to the user. This is also an optimal solution for the user's query.
+
+ **Example:**
+ The user asked to summarize a file on their SharePoint. The agent calls the sharepoint_grounding tool to retrieve the file. This retrieved file will help the agent fulfill the task of summarization. This is a Level 5.
+
+
+ ## Chain of Thought Structure
+ Structure your reasoning as follows:
+ 1. **Start with the user's last query**: Understand well what the last message sent by the user is.
+ 2. **Identify relevant available tools**: Look into the TOOL DEFINITIONS and analyze which tools could help answer the user's last query in the conversation.
+ 3. **Analyze the actual tool calls made**: Compare what was done in the TOOL CALLS TO BE EVALUATED section vs. what should have been done by the agent.
+ 4. **Check parameter grounding**: Ensure all parameters are grounded in the CONVERSATION section and are not hallucinated.
+ 5. **Determine the appropriate level**: Be VERY precise and follow the level definitions exactly.
+
+ # Data
+ CONVERSATION: {{query}}
+ TOOL CALLS TO BE EVALUATED: {{tool_calls}}
+ TOOL DEFINITIONS: {{tool_definitions}}
+
+
+ # Tasks
+ ## Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
+ Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
+   - chain_of_thought: a string that explains your thought process to decide on the tool call accuracy level, based on the Chain of Thought structure. Start this string with 'Let's think step by step:'.
+   - tool_calls_success_level: an integer value between 1 and 5 that represents the level of tool call success, based on the level definitions mentioned before. You need to be very precise when deciding on this level. Ensure you are correctly following the rating system based on the description of each level.
+   - details: a dictionary that contains the following keys:
+     - tool_calls_made_by_agent: total number of tool calls made by the agent
+     - correct_tool_calls_made_by_agent: total number of correct tool calls made by the agent
+     - per_tool_call_details: a list of dictionaries, each containing:
+       - tool_name: name of the tool
+       - total_calls_required: total number of calls required for the tool
+       - correct_calls_made_by_agent: number of correct calls made by the agent
+       - correct_tool_percentage: percentage of correct calls made by the agent for this tool. It is a value between 0.0 and 1.0
+       - tool_call_errors: number of errors encountered during the tool call
+       - tool_success_result: 'pass' or 'fail' based on the evaluation of the tool call accuracy for this tool
+     - excess_tool_calls: a dictionary with the following keys:
+       - total: total number of excess, unnecessary tool calls made by the agent
+       - details: a list of dictionaries, each containing:
+         - tool_name: name of the tool
+         - excess_count: number of excess calls made for this query
+     - missing_tool_calls: a dictionary with the following keys:
+       - total: total number of missing tool calls that should have been made by the agent to be able to answer the query, but were not made by the agent at all.
+       - details: a list of dictionaries, each containing:
+         - tool_name: name of the tool
+         - missing_count: number of missing calls for this query
+
+ # Output
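
To make the grader contract above concrete, here is an illustrative sketch (written as a Python dict) of the JSON object the prompty instructs the model to return; the key names come from the Tasks list above, while the values describe a hypothetical single-call weather query.

# Illustrative only: the output shape requested in the Tasks section above.
example_grader_output = {
    "chain_of_thought": "Let's think step by step: the user asked for today's weather in Seattle, ...",
    "tool_calls_success_level": 5,
    "details": {
        "tool_calls_made_by_agent": 1,
        "correct_tool_calls_made_by_agent": 1,
        "per_tool_call_details": [
            {
                "tool_name": "get_weather",
                "total_calls_required": 1,
                "correct_calls_made_by_agent": 1,
                "correct_tool_percentage": 1.0,
                "tool_call_errors": 0,
                "tool_success_result": "pass",
            }
        ],
        "excess_tool_calls": {"total": 0, "details": []},
        "missing_tool_calls": {"total": 0, "details": []},
    },
}
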
azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py
@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._tool_call_success import _ToolCallSuccessEvaluator
+
+ __all__ = ["_ToolCallSuccessEvaluator"]