azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -0,0 +1,263 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import logging
+import math
+import json
+from typing import Dict, List, Union, TypeVar, Optional, cast
+from typing_extensions import override
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
+from ..._common.utils import reformat_conversation_history, _get_agent_response
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class _ToolInputAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Input Accuracy evaluator performs a strict binary evaluation (PASS/FAIL) of parameters
+    passed to tool calls. It ensures that ALL parameters meet ALL criteria:
+
+    - Parameter grounding: All parameters must be derived from conversation history/query
+    - Type compliance: All parameters must match exact types specified in tool definitions
+    - Format compliance: All parameters must follow exact format and structure requirements
+    - Completeness: All required parameters must be provided
+    - No unexpected parameters: Only defined parameters are allowed
+
+    The evaluator uses strict binary evaluation:
+    - 1: Only when ALL criteria are satisfied perfectly for ALL parameters
+    - 0: When ANY criterion fails for ANY parameter
+
+    This evaluation focuses on ensuring tool call parameters are completely correct without any tolerance
+    for partial correctness.
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_input_accuracy_evaluator]
+            :end-before: [END tool_input_accuracy_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _ToolInputAccuracyEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_input_accuracy_evaluator]
+            :end-before: [END tool_input_accuracy_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call _ToolInputAccuracyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    .. note::
+
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+    """
+
+    _PROMPTY_FILE = "tool_input_accuracy.prompty"
+    _RESULT_KEY = "tool_input_accuracy"
+
+    _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
+    _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
+    _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
+
+    def __init__(
+        self,
+        model_config,
+        *,
+        credential=None,
+        **kwargs,
+    ):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=1,
+            credential=credential,
+            **kwargs,
+        )
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        """Convert kwargs to evaluation input format.
+
+        :keyword kwargs: The inputs to convert.
+        :type kwargs: Dict
+        :return: The formatted evaluation input.
+        :rtype: Dict
+        """
+        # Collect inputs
+        tool_definitions = kwargs.get("tool_definitions", [])  # Default to empty list
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+
+        # Extract tool calls from response
+        if not response:
+            return {"error_message": "Response parameter is required to extract tool calls."}
+
+        tool_calls = self._parse_tools_from_response(response)
+        if not tool_calls:
+            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
+
+        if not isinstance(tool_calls, list):
+            tool_calls = [tool_calls]
+        if not isinstance(tool_definitions, list):
+            tool_definitions = [tool_definitions] if tool_definitions else []
+
+        try:
+            # Type cast to satisfy static type checker
+            tool_calls_typed = cast(List[Dict], tool_calls)
+            needed_tool_definitions = self._extract_needed_tool_definitions(
+                tool_calls_typed, tool_definitions, ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR
+            )
+        except EvaluationException as e:
+            # Check if this is because no tool definitions were provided at all
+            if len(tool_definitions) == 0:
+                return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+            else:
+                return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
+
+        if len(needed_tool_definitions) == 0:
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
+
+        # Get agent response with tool calls and results using _get_agent_response
+        agent_response_with_tools = _get_agent_response(response, include_tool_messages=True)
+
+        return {
+            "query": query,
+            "tool_calls": agent_response_with_tools,
+            "tool_definitions": needed_tool_definitions,
+        }
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+        """Do Tool Input Accuracy evaluation.
+
+        :param eval_input: The input to the evaluator.
+        :type eval_input: Dict
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        # Format conversation history for cleaner evaluation
+        if "query" in eval_input:
+            eval_input["query"] = reformat_conversation_history(
+                eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+            )
+
+        # Call the LLM to evaluate
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", {})
+
+        if isinstance(llm_output, dict):
+            result = llm_output.get("result", None)
+            if result not in [0, 1]:
+                raise EvaluationException(
+                    message=f"Invalid result value: {result}. Expected 0 or 1.",
+                    internal_message="Invalid result value.",
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    blame=ErrorBlame.SYSTEM_ERROR,
+                )
+
+            # Add parameter extraction accuracy post-processing
+            details = llm_output.get("details", {})
+            if details:
+                parameter_extraction_accuracy = self._calculate_parameter_extraction_accuracy(details)
+                details["parameter_extraction_accuracy"] = parameter_extraction_accuracy
+
+            # Format the output
+            explanation = llm_output.get("chain_of_thought", "")
+            score_result = "pass" if result == 1 else "fail"
+            response_dict = {
+                self._result_key: result,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": explanation,
+                f"{self._result_key}_details": details,
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+            return response_dict
+
+        else:
+            raise EvaluationException(
+                message="Tool input accuracy evaluator returned invalid output.",
+                blame=ErrorBlame.SYSTEM_ERROR,
+                category=ErrorCategory.FAILED_EXECUTION,
+                target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
+            )
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        eval_input = self._convert_kwargs_to_eval_input(**kwargs)
+        if isinstance(eval_input, dict) and eval_input.get("error_message"):
+            # If there is an error message, return not applicable result
+            error_message = eval_input.get("error_message", "Unknown error")
+            return self._not_applicable_result(error_message, 1)
+        # Do the evaluation
+        result = await self._do_eval(eval_input)
+        # Return the result
+        return result
+
+    def _calculate_parameter_extraction_accuracy(self, details):
+        """Calculate parameter extraction accuracy from the evaluation details.
+
+        :param details: The details dictionary from the LLM evaluation output
+        :type details: Dict
+        :return: Parameter extraction accuracy as a percentage
+        :rtype: float
+        """
+        total_parameters = details.get("total_parameters_passed", 0)
+        correct_parameters = details.get("correct_parameters_passed", 0)
+
+        if total_parameters == 0:
+            return 100.0  # If no parameters were passed, accuracy is 100%
+
+        accuracy = (correct_parameters / total_parameters) * 100
+        return round(accuracy, 2)
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate parameter correctness of tool calls.
+
+        :keyword query: Query or Chat history up to the message that has the tool call being evaluated.
+        :paramtype query: Union[str, List[dict]]
+        :keyword tool_definitions: List of tool definitions whose calls are being evaluated.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :keyword response: Response containing tool calls to be evaluated.
+        :paramtype response: Union[str, List[dict]]
+        :return: The tool input accuracy evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return super().__call__(*args, **kwargs)
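
The docstring above defines the evaluator's keyword-only interface (query, response, tool_definitions) and its binary 0/1 output keyed by tool_input_accuracy. Below is a minimal usage sketch, not taken from the package's samples: the endpoint, key, and deployment values are placeholders, the conversation payload is invented for illustration, and the import path simply mirrors the private, experimental module added in this diff.

from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._tool_input_accuracy._tool_input_accuracy import (
    _ToolInputAccuracyEvaluator,
)

# Placeholder model configuration; substitute a real endpoint, key, and deployment.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<chat-deployment>",
)

evaluator = _ToolInputAccuracyEvaluator(model_config)

# Hypothetical conversation: the assistant calls a tool whose argument is grounded in the query.
query = [{"role": "user", "content": "What is the status of order 123?"}]
response = [
    {
        "role": "assistant",
        "content": [
            {
                "type": "tool_call",
                "tool_call": {
                    "id": "call_1",
                    "type": "function",
                    "function": {"name": "get_order", "arguments": {"order_id": "123"}},
                },
            }
        ],
    }
]
tool_definitions = [
    {
        "name": "get_order",
        "description": "Get the details of a specific order.",
        "parameters": {
            "type": "object",
            "properties": {"order_id": {"type": "string", "description": "The order ID."}},
        },
    }
]

result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
# 0 or 1 under the "tool_input_accuracy" key, plus a "pass"/"fail" label and reason string.
print(result["tool_input_accuracy"], result["tool_input_accuracy_result"])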
azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty
@@ -0,0 +1,76 @@
+---
+name: Tool Input Accuracy
+description: Evaluates the accuracy of all inputs/parameters passed to the tools by the agent
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 1000
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_object
+
+inputs:
+  query:
+    type: List
+  tool_calls:
+    type: List
+  tool_definitions:
+    type: Dict
+---
+
+# system:
+You are an AI system designed to evaluate the correctness of parameters passed to tool calls. Your task is to perform a strict binary evaluation (PASS/FAIL) based on whether ALL parameters are correct.
+
+The evaluation must check ALL of the following criteria. If ANY criterion fails, the overall result is FAIL:
+1. **Parameter Groundedness**: ALL parameters must be derived from or supported by information in the conversation history/query. NO fabricated or unsupported values.
+2. **Type Compliance**: ALL parameters must match the exact type specified in the tool definitions (string, number, boolean, array, object, etc.).
+3. **Format Compliance**: ALL parameters must follow the exact format, structure, and constraints specified in the tool definitions.
+4. **Required Parameters**: ALL required parameters must be provided. Missing any required parameter results in FAIL.
+5. **Unexpected Parameters**: NO parameters should be provided that are not defined in the tool definition. Any extra/unexpected parameters result in FAIL.
+6. **Value Appropriateness**: ALL parameter values must be contextually appropriate and meaningful for the tool's purpose.
+
+## Evaluation Rules
+
+**PASS**: Only when ALL criteria above are satisfied perfectly. Every single parameter must be:
+- Properly grounded in conversation history/query
+- Correct type according to tool definition
+- Proper format and structure
+- Required parameters all present
+- No unexpected/undefined parameters
+- Contextually appropriate values
+
+**FAIL**: When ANY of the above criteria fails, including:
+- Any parameter lacks grounding in conversation history
+- Any parameter has wrong type
+- Any parameter has wrong format/structure
+- Any required parameter is missing
+- Any unexpected parameter is present
+- Any parameter value is inappropriate for the context
+
+## Task
+Analyze each tool call and its parameters against the provided tool definitions and conversation context. Provide your evaluation in the following JSON format:
+
+{
+  "chain_of_thought": "Step-by-step analysis for all parameters passed to all the tools to check for the criteria mentioned above",
+  "details": {
+    "total_parameters_passed": <number of total parameters that were passed to all tools>,
+    "correct_parameters_passed": <number of correct parameters that were passed to all tools in the agent's response>,
+    "incorrect_parameters": ["list of incorrect parameters passed with reasons"]
+  },
+  "result": <0 for FAIL, 1 for PASS>
+}
+
+
+## Conversation History/Query:
+{{query}}
+
+## Tool Calls Made:
+{{tool_calls}}
+
+## Tool Definitions:
+{{tool_definitions}}
+
+# Output
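
The details block requested from the grader above feeds the _calculate_parameter_extraction_accuracy post-processing step in _tool_input_accuracy.py. A small worked sketch with made-up counts shows the arithmetic the evaluator applies:

# Hypothetical grader output: 3 of 4 parameters across all tool calls judged correct.
details = {
    "total_parameters_passed": 4,
    "correct_parameters_passed": 3,
    "incorrect_parameters": ["order_id: value not grounded in the conversation"],
}

# Mirrors _calculate_parameter_extraction_accuracy: a percentage rounded to two decimals.
accuracy = round(details["correct_parameters_passed"] / details["total_parameters_passed"] * 100, 2)
print(accuracy)  # 75.0, surfaced as details["parameter_extraction_accuracy"] in the evaluator output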
azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py
@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._tool_output_utilization import _ToolOutputUtilizationEvaluator
+
+__all__ = ["_ToolOutputUtilizationEvaluator"]
azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py
@@ -0,0 +1,225 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import math
+import logging
+from typing import Dict, Union, List, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import (
+    EvaluationException,
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+)
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from ..._common.utils import (
+    reformat_conversation_history,
+    reformat_agent_response,
+    reformat_tool_definitions,
+    filter_to_used_tools,
+)
+from azure.ai.evaluation._model_configurations import Message
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class _ToolOutputUtilizationEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Output Utilization Evaluator assesses how effectively an AI agent utilizes the outputs from tools and whether it accurately incorporates this information into its responses.
+
+    Scoring is based on two levels:
+    1. Pass - The agent effectively utilizes tool outputs and accurately incorporates the information into its response.
+    2. Fail - The agent fails to properly utilize tool outputs or incorrectly incorporates the information into its response.
+
+    The evaluation includes the score, a brief explanation, and a final pass/fail result.
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START tool_output_utilization_evaluator]
+            :end-before: [END tool_output_utilization_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _ToolOutputUtilizationEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_output_utilization_evaluator]
+            :end-before: [END tool_output_utilization_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call _ToolOutputUtilizationEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    """
+
+    _PROMPTY_FILE = "tool_output_utilization.prompty"
+    _RESULT_KEY = "tool_output_utilization"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    _DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE = 1
+
+    id = "azureai://built-in/evaluators/tool_output_utilization"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(
+        self,
+        model_config,
+        *,
+        credential=None,
+        **kwargs,
+    ):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            threshold=self._DEFAULT_TOOL_OUTPUT_UTILIZATION_SCORE,
+            credential=credential,
+            _higher_is_better=True,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Union[dict, List[dict]],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate tool output utilization for a given query, response, and optional tool definitions.
+        The query and response can be either a string or a list of messages.
+
+        Example with string inputs and no tools:
+            evaluator = _ToolOutputUtilizationEvaluator(model_config)
+            query = "What is the weather today?"
+            response = "The weather is sunny."
+
+            result = evaluator(query=query, response=response)
+
+        Example with list of messages:
+            evaluator = _ToolOutputUtilizationEvaluator(model_config)
+            query = [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+            tool_definitions = [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
+
+            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        :keyword query: The query being evaluated, either a string or a list of messages.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls).
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+        :paramtype tool_definitions: Union[dict, List[dict]]
+        :return: A dictionary with the tool output utilization evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do Tool Output Utilization evaluation.
+
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # We override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py
+        if ("query" not in eval_input) and ("response" not in eval_input) and ("tool_definitions" not in eval_input):
+            raise EvaluationException(
+                message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.",
+                internal_message="Query, response, and tool_definitions are required inputs to the Tool Output Utilization evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TOOL_OUTPUT_UTILIZATION_EVALUATOR,
+            )
+
+        tool_definitions = eval_input["tool_definitions"]
+        filtered_tool_definitions = filter_to_used_tools(
+            tool_definitions=tool_definitions,
+            msgs_lists=[eval_input["query"], eval_input["response"]],
+            logger=logger,
+        )
+        eval_input["tool_definitions"] = reformat_tool_definitions(filtered_tool_definitions, logger)
+
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"],
+            logger,
+            include_system_messages=True,
+            include_tool_messages=True,
+        )
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", "")
+        if isinstance(llm_output, dict):
+            output_label = llm_output.get("label", None)
+            if output_label is None:
+                if logger:
+                    logger.warning("LLM output does not contain 'label' key, returning NaN for the score.")
+                output_label = "fail"
+
+            output_label = output_label.lower()
+            if output_label not in ["pass", "fail"]:
+                if logger:
+                    logger.warning(
+                        f"LLM output label is not 'pass' or 'fail' (got '{output_label}'), returning NaN for the score."
+                    )
+
+            score = 1.0 if output_label == "pass" else 0.0
+            score_result = output_label
+            reason = llm_output.get("reason", "")
+
+            faulty_details = llm_output.get("faulty_details", [])
+            if faulty_details:
+                reason += " Issues found: " + "; ".join(faulty_details)
+
+            return {
+                f"{self._result_key}": score,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_result": score_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        score = math.nan
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
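
To show how the result dictionary produced by _do_eval above is consumed, here is a short sketch that reuses the string example from the class's own overload docstring; model_config is the same placeholder configuration as in the earlier sketch, and the printed keys follow the _RESULT_KEY = "tool_output_utilization" naming in the code.

from azure.ai.evaluation._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator

evaluator = _ToolOutputUtilizationEvaluator(model_config)  # placeholder model_config as before

# String inputs with no tools, as in the overload docstring's first example.
result = evaluator(query="What is the weather today?", response="The weather is sunny.")

# Numeric score (1.0 pass / 0.0 fail), the pass/fail label, and the model's explanation.
print(result["tool_output_utilization"])
print(result["tool_output_utilization_result"])
print(result["tool_output_utilization_reason"])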