azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -1,58 +1,99 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ from typing import Dict
  from nltk.translate.gleu_score import sentence_gleu
- from promptflow._utils.async_utils import async_run_allowing_running_loop
+ from typing_extensions import overload, override

  from azure.ai.evaluation._common.utils import nltk_tokenize

+ from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


- class _AsyncGleuScoreEvaluator:
-     def __init__(self):
-         pass

-     async def __call__(self, *, ground_truth: str, response: str, **kwargs):
-         reference_tokens = nltk_tokenize(ground_truth)
-         hypothesis_tokens = nltk_tokenize(response)
-
-         score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
-         return {
-             "gleu_score": score,
-         }
-
-
- class GleuScoreEvaluator:
+ class GleuScoreEvaluator(EvaluatorBase):
      """
-     Evaluator that computes the BLEU Score between two strings.
+     Calculates the GLEU (Google-BLEU) score between a response and the ground truth.

      The GLEU (Google-BLEU) score evaluator measures the similarity between generated and reference texts by
      evaluating n-gram overlap, considering both precision and recall. This balanced evaluation, designed for
      sentence-level assessment, makes it ideal for detailed analysis of translation quality. GLEU is well-suited for
      use cases such as machine translation, text summarization, and text generation.

-     **Usage**
+     GLEU scores range from 0 to 1, where a value of 1 represents perfect overlap between the response and
+     the ground truth and a value of 0 indicates no overlap.

-     .. code-block:: python
+     :param threshold: The threshold for the GLEU evaluator. Default is 0.5.
+     :type threshold: float

-         eval_fn = GleuScoreEvaluator()
-         result = eval_fn(
-             response="Tokyo is the capital of Japan.",
-             ground_truth="The capital of Japan is Tokyo.")
+     .. admonition:: Example:

-     **Output format**
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START gleu_score_evaluator]
+             :end-before: [END gleu_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a GleuScoreEvaluator.

-     .. code-block:: python
+     .. admonition:: Example with Threshold:

-         {
-             "gleu_score": 0.41
-         }
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_gleu_score_evaluator]
+             :end-before: [END threshold_gleu_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a GleuScoreEvaluator.
+
+     .. admonition:: Example using Azure AI Project URL:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START gleu_score_evaluator]
+             :end-before: [END gleu_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call GleuScoreEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
      """

-     def __init__(self):
-         self._async_evaluator = _AsyncGleuScoreEvaluator()
+     id = "azureai://built-in/evaluators/gleu_score"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-     def __call__(self, *, ground_truth: str, response: str, **kwargs):
+     @override
+     def __init__(self, *, threshold=0.5):
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+         """Produce a glue score evaluation result.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         ground_truth = eval_input["ground_truth"]
+         response = eval_input["response"]
+         reference_tokens = nltk_tokenize(ground_truth)
+         hypothesis_tokens = nltk_tokenize(response)
+
+         score = sentence_gleu([reference_tokens], hypothesis_tokens)
+         binary_result = False
+         if self._higher_is_better:
+             if score >= self._threshold:
+                 binary_result = True
+         else:
+             if score <= self._threshold:
+                 binary_result = True
+         return {
+             "gleu_score": score,
+             "gleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+             "gleu_threshold": self._threshold,
+         }
+
+     @overload  # type: ignore
+     def __call__(self, *, ground_truth: str, response: str):
          """
          Evaluate the GLEU score between the response and the ground truth.

@@ -61,11 +102,23 @@ class GleuScoreEvaluator:
          :keyword ground_truth: The ground truth to be compared against.
          :paramtype ground_truth: str
          :return: The GLEU score.
-         :rtype: dict
+         :rtype: Dict[str, float]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
          """
-         return async_run_allowing_running_loop(
-             self._async_evaluator, ground_truth=ground_truth, response=response, **kwargs
-         )
+         Evaluate the GLEU score between the response and the ground truth.

-     def _to_async(self):
-         return self._async_evaluator
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword ground_truth: The ground truth to be compared against.
+         :paramtype ground_truth: str
+         :return: The GLEU score.
+         :rtype: Dict[str, float]
+         """
+         return super().__call__(*args, **kwargs)
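For orientation, here is a minimal usage sketch of the reworked evaluator, assuming the top-level GleuScoreEvaluator export and the output keys produced by the new _do_eval above; the sample strings and the exact wording produced by EVALUATION_PASS_FAIL_MAPPING are illustrative assumptions, not taken from the package:

    from azure.ai.evaluation import GleuScoreEvaluator

    # threshold is the new keyword-only constructor parameter (documented default: 0.5)
    gleu = GleuScoreEvaluator(threshold=0.5)
    result = gleu(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    # Per the new _do_eval, the result dict carries three keys:
    #   gleu_score      - sentence_gleu value in [0, 1]
    #   gleu_result     - pass/fail label via EVALUATION_PASS_FAIL_MAPPING (assumed wording)
    #   gleu_threshold  - the configured threshold
    print(result)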
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -1,118 +1,354 @@
  # ---------------------------------------------------------
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
+ import os, logging
+ from typing import Dict, List, Optional, Union, Any, Tuple

- import os
- import re
+ from typing_extensions import overload, override
+ from azure.ai.evaluation._legacy.prompty import AsyncPrompty

- import numpy as np
- from promptflow._utils.async_utils import async_run_allowing_running_loop
- from promptflow.core import AsyncPrompty
-
- from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-
- from ..._common.utils import ensure_api_version_in_aoai_model_config, ensure_user_agent_in_aoai_model_config
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation
+ from ..._common.utils import (
+     ErrorBlame,
+     ErrorTarget,
+     EvaluationException,
+     ErrorCategory,
+     construct_prompty_model_config,
+     validate_model_config,
+     simplify_messages,
+ )

  try:
-     from ..._user_agent import USER_AGENT
+     from ..._user_agent import UserAgentSingleton
  except ImportError:
-     USER_AGENT = None
-
-
- class _AsyncGroundednessEvaluator:
-     # Constants must be defined within eval's directory to be save/loadable
-     PROMPTY_FILE = "groundedness.prompty"
-     LLM_CALL_TIMEOUT = 600
-     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

-     def __init__(self, model_config: dict):
-         ensure_api_version_in_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+     class UserAgentSingleton:
+         @property
+         def value(self) -> str:
+             return "None"

-         prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}

-         # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-         # https://github.com/encode/httpx/discussions/2959
-         prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+ logger = logging.getLogger(__name__)

-         ensure_user_agent_in_aoai_model_config(
-             model_config,
-             prompty_model_config,
-             USER_AGENT,
-         )

-         current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, "groundedness.prompty")
-         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+     including reasoning.

-     async def __call__(self, *, response: str, context: str, **kwargs):
-         # Validate input parameters
-         response = str(response or "")
-         context = str(context or "")
+     The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
+     context, making sure that these claims are substantiated by the context. Even if the responses from LLM are
+     factually correct, they'll be considered ungrounded if they can't be verified against the provided sources
+     (such as your input source or your database). Use the groundedness metric when you need to verify that
+     AI-generated responses align with and are validated by the provided context.

-         if not response.strip() or not context.strip():
-             msg = "Both 'response' and 'context' must be non-empty strings."
-             raise EvaluationException(
-                 message=msg,
-                 internal_message=msg,
-                 error_category=ErrorCategory.MISSING_FIELD,
-                 error_blame=ErrorBlame.USER_ERROR,
-                 error_target=ErrorTarget.F1_EVALUATOR,
-             )
+     Groundedness scores range from 1 to 5, with 1 being the least grounded and 5 being the most grounded.

-         # Run the evaluation flow
-         llm_output = await self._flow(response=response, context=context, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+     :param threshold: The threshold for the groundedness evaluator. Default is 3.
+     :type threshold: int
+     :param credential: The credential for authenticating to Azure AI service.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+         This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+     :paramtype is_reasoning_model: bool

-         score = np.nan
-         if llm_output:
-             match = re.search(r"\d", llm_output)
-             if match:
-                 score = float(match.group())
+     .. admonition:: Example:

-         return {"gpt_groundedness": float(score)}
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START groundedness_evaluator]
+             :end-before: [END groundedness_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call a GroundednessEvaluator.

+     .. admonition:: Example with Threshold:

- class GroundednessEvaluator:
-     """
-     Initialize a groundedness evaluator configured for a specific Azure OpenAI model.
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_groundedness_evaluator]
+             :end-before: [END threshold_groundedness_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a GroundednessEvaluator.

-     :param model_config: Configuration for the Azure OpenAI model.
-     :type model_config: Union[~azure.ai.evalation.AzureOpenAIModelConfiguration,
-         ~azure.ai.evalation.OpenAIModelConfiguration]
+     .. admonition:: Example using Azure AI Project URL:

-     **Usage**
+         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+             :start-after: [START groundedness_evaluator]
+             :end-before: [END groundedness_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+                 https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

-     .. code-block:: python
+     .. note::

-         eval_fn = GroundednessEvaluator(model_config)
-         result = eval_fn(
-             response="The capital of Japan is Tokyo.",
-             context="Tokyo is Japan's capital, known for its blend of traditional culture \
-                 and technological advancements.")
+         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+         To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+         however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+     """

-     **Output format**
+     _PROMPTY_FILE_NO_QUERY = "groundedness_without_query.prompty"
+     _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
+     _RESULT_KEY = "groundedness"
+     _OPTIONAL_PARAMS = ["query"]
+     _SUPPORTED_TOOLS = ["file_search"]

-     .. code-block:: python
+     id = "azureai://built-in/evaluators/groundedness"
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-         {
-             "gpt_groundedness": 5
-         }
-     """
+     @override
+     def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query

-     def __init__(self, model_config: dict):
-         self._async_evaluator = _AsyncGroundednessEvaluator(model_config)
+         self._higher_is_better = True
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             credential=credential,
+             _higher_is_better=self._higher_is_better,
+             **kwargs,
+         )
+         self._model_config = model_config
+         self.threshold = threshold
+         # Needs to be set because it's used in call method to re-validate prompt if `query` is provided

-     def __call__(self, *, response: str, context: str, **kwargs):
-         """
-         Evaluate groundedness of the response in the context.
+     @overload
+     def __call__(
+         self,
+         *,
+         response: str,
+         context: str,
+         query: Optional[str] = None,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate groundedness for given input of response, context

          :keyword response: The response to be evaluated.
          :paramtype response: str
-         :keyword context: The context in which the response is evaluated.
+         :keyword context: The context to be evaluated.
          :paramtype context: str
+         :keyword query: The query to be evaluated. Optional parameter for use with the `response`
+             and `context` parameters. If provided, a different prompt template will be used for evaluation.
+         :paramtype query: Optional[str]
+         :return: The groundedness score.
+         :rtype: Dict[str, float]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query: str,
+         response: List[dict],
+         tool_definitions: List[dict],
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response from the agent to be evaluated.
+         :paramtype response: List[dict]
+         :keyword tool_definitions: The tool definitions used by the agent.
+         :paramtype tool_definitions: List[dict]
+         :return: The groundedness score.
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     @overload
+     def __call__(
+         self,
+         *,
+         conversation: Conversation,
+     ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+         """Evaluate groundedness for a conversation
+
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
          :return: The groundedness score.
-         :rtype: dict
+         :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+         """
+
+     @override
+     def __call__(  # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """Evaluate groundedness. Accepts either a query, response, and context for a single evaluation,
+         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+         the evaluator will aggregate the results of each turn.
+
+         :keyword query: The query to be evaluated. Mutually exclusive with `conversation`. Optional parameter for use
+             with the `response` and `context` parameters. If provided, a different prompt template will be used for
+             evaluation.
+         :paramtype query: Optional[str]
+         :keyword response: The response to be evaluated. Mutually exclusive with the `conversation` parameter.
+         :paramtype response: Optional[str]
+         :keyword context: The context to be evaluated. Mutually exclusive with the `conversation` parameter.
+         :paramtype context: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The relevance score.
+         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
+         """
+
+         if kwargs.get("query", None):
+             self._ensure_query_prompty_loaded()
+
+         return super().__call__(*args, **kwargs)
+
+     def _ensure_query_prompty_loaded(self):
+         """Switch to the query prompty file if not already loaded."""
+
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+
+         self._prompty_file = prompty_path
+         prompty_model_config = construct_prompty_model_config(
+             validate_model_config(self._model_config),
+             self._DEFAULT_OPEN_API_VERSION,
+             UserAgentSingleton().value,
+         )
+         self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+
+     def _has_context(self, eval_input: dict) -> bool:
+         """
+         Return True if eval_input contains a non-empty 'context' field.
+         Treats None, empty strings, empty lists, and lists of empty strings as no context.
+         """
+         context = eval_input.get("context", None)
+         if not context:
+             return False
+         if context == "<>":  # Special marker for no context
+             return False
+         if isinstance(context, list):
+             return any(str(c).strip() for c in context)
+         if isinstance(context, str):
+             return bool(context.strip())
+         return True
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
+         if eval_input.get("query", None) is None:
+             return await super()._do_eval(eval_input)
+
+         contains_context = self._has_context(eval_input)
+
+         simplified_query = simplify_messages(eval_input["query"], drop_tool_calls=contains_context)
+         simplified_response = simplify_messages(eval_input["response"], drop_tool_calls=False)
+
+         # Build simplified input
+         simplified_eval_input = {
+             "query": simplified_query,
+             "response": simplified_response,
+             "context": eval_input["context"],
+         }
+
+         # Replace and call the parent method
+         return await super()._do_eval(simplified_eval_input)
+
+     async def _real_call(self, **kwargs):
+         """The asynchronous call where real end-to-end evaluation logic is performed.
+
+         :keyword kwargs: The inputs to evaluate.
+         :type kwargs: Dict
+         :return: The evaluation result.
+         :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
          """
-         return async_run_allowing_running_loop(self._async_evaluator, response=response, context=context, **kwargs)
+         # Convert inputs into list of evaluable inputs.
+         try:
+             return await super()._real_call(**kwargs)
+         except EvaluationException as ex:
+             if ex.category == ErrorCategory.NOT_APPLICABLE:
+                 return {
+                     self._result_key: self._NOT_APPLICABLE_RESULT,
+                     f"{self._result_key}_result": "pass",
+                     f"{self._result_key}_threshold": self.threshold,
+                     f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                 }
+             else:
+                 raise ex
+
+     def _convert_kwargs_to_eval_input(self, **kwargs):
+         if kwargs.get("context") or kwargs.get("conversation"):
+             return super()._convert_kwargs_to_eval_input(**kwargs)
+         query = kwargs.get("query")
+         response = kwargs.get("response")
+         tool_definitions = kwargs.get("tool_definitions")
+
+         if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
+             self._ensure_query_prompty_loaded()
+
+         if (not query) or (not response):  # or not tool_definitions:
+             msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
+             raise EvaluationException(
+                 message=msg,
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.INVALID_VALUE,
+                 target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+             )
+         context = self._get_context_from_agent_response(response, tool_definitions)
+
+         filtered_response = self._filter_file_search_results(response)
+         return super()._convert_kwargs_to_eval_input(response=filtered_response, context=context, query=query)
+
+     def _filter_file_search_results(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+         """Filter out file_search tool results from the messages."""
+         file_search_ids = self._get_file_search_tool_call_ids(messages)
+         return [
+             msg for msg in messages if not (msg.get("role") == "tool" and msg.get("tool_call_id") in file_search_ids)
+         ]
+
+     def _get_context_from_agent_response(self, response, tool_definitions):
+         """Extract context text from file_search tool results in the agent response."""
+         NO_CONTEXT = "<>"
+         context = ""
+         try:
+             logger.debug("Extracting context from response")
+             tool_calls = self._parse_tools_from_response(response=response)
+             logger.debug(f"Tool Calls parsed successfully: {tool_calls}")
+
+             if not tool_calls:
+                 return NO_CONTEXT
+
+             context_lines = []
+             for tool_call in tool_calls:
+                 if not isinstance(tool_call, dict) or tool_call.get("type") != "tool_call":
+                     continue
+
+                 tool_name = tool_call.get("name")
+                 if tool_name != "file_search":
+                     continue
+
+                 # Extract tool results
+                 for result in tool_call.get("tool_result", []):
+                     results = result if isinstance(result, list) else [result]
+                     for r in results:
+                         file_name = r.get("file_name", "Unknown file name")
+                         for content in r.get("content", []):
+                             text = content.get("text")
+                             if text:
+                                 context_lines.append(f"{file_name}:\n- {text}---\n\n")
+
+             context = "\n".join(context_lines) if len(context_lines) > 0 else None
+
+         except Exception as ex:
+             logger.debug(f"Error extracting context from agent response : {str(ex)}")
+             context = None
+
+         context = context if context else NO_CONTEXT
+         return context

-     def _to_async(self):
-         return self._async_evaluator
+     def _get_file_search_tool_call_ids(self, query_or_response):
+         """Return a list of tool_call_ids for file search tool calls."""
+         tool_calls = self._parse_tools_from_response(query_or_response)
+         return [tc.get("tool_call_id") for tc in tool_calls if tc.get("name") == "file_search"]
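As a companion to the diff above, a minimal usage sketch assuming the top-level AzureOpenAIModelConfiguration and GroundednessEvaluator exports; the endpoint, deployment, and sample strings are placeholders, and the exact output key set depends on PromptyEvaluatorBase:

    from azure.ai.evaluation import AzureOpenAIModelConfiguration, GroundednessEvaluator

    # Placeholder configuration; AzureOpenAIModelConfiguration is a TypedDict, so this
    # call simply builds a dict with the expected keys.
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint="https://<your-resource>.openai.azure.com",
        azure_deployment="<deployment-name>",
        api_key="<api-key>",
    )

    groundedness = GroundednessEvaluator(model_config, threshold=3)

    # Supplying a query switches the evaluator to groundedness_with_query.prompty,
    # per _ensure_query_prompty_loaded in the diff above.
    result = groundedness(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        context="Tokyo is Japan's capital city.",
    )
    # Expected keys follow _RESULT_KEY = "groundedness", e.g. groundedness,
    # groundedness_result, and groundedness_threshold.
    print(result)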