azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (299) hide show
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,899 @@
1
+ import json
2
+ from abc import abstractmethod
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
+
5
+ from azure.ai.projects import __version__ as projects_version
6
+ from azure.ai.projects import AIProjectClient
7
+
8
+ from typing import List, Union
9
+
10
+ from azure.ai.evaluation._common._experimental import experimental
11
+ from packaging.version import Version
12
+
13
+ # Constants.
14
+ from ._models import (
15
+ _USER,
16
+ _AGENT,
17
+ _TOOL,
18
+ _TOOL_CALL,
19
+ _TOOL_CALLS,
20
+ _FUNCTION,
21
+ _BUILT_IN_DESCRIPTIONS,
22
+ _BUILT_IN_PARAMS,
23
+ _OPENAPI,
24
+ OpenAPIToolDefinition,
25
+ )
26
+
27
+ # Message instances.
28
+ from ._models import Message, SystemMessage, UserMessage, AssistantMessage, ToolCall
29
+
30
+ # Intermediate definitions to hold results.
31
+ from ._models import ToolDefinition, EvaluatorData
32
+
33
+ # Utilities.
34
+ from ._models import break_tool_call_into_messages, convert_message
35
+
36
+
37
+ @experimental
38
+ class AIAgentConverter:
39
+ """
40
+ A converter for AI agent data. Data retrieval classes handle getting agent data depending on
41
+ agent version.
42
+
43
+ :param project_client: The AI project client used for API interactions.
44
+ :type project_client: AIProjectClient
45
+ """
46
+
47
+ # Maximum number of workers allowed to make API calls at the same time.
48
+ _MAX_WORKERS = 10
49
+
50
+ def __init__(self, project_client: AIProjectClient):
51
+ """
52
+ Initializes the AIAgentConverter with the given AI project client.
53
+
54
+ :param project_client: The AI project client used for API interactions.
55
+ :type project_client: AIProjectClient
56
+ """
57
+ self.project_client = project_client
58
+ self._data_retriever = AIAgentConverter._get_data_retriever(project_client=project_client)
59
+
60
+ @staticmethod
61
+ def _get_data_retriever(project_client: AIProjectClient):
62
+ if project_client is None:
63
+ return None
64
+ if Version(projects_version) > Version("1.0.0b10"):
65
+ return FDPAgentDataRetriever(project_client=project_client)
66
+ else:
67
+ return LegacyAgentDataRetriever(project_client=project_client)
68
+
69
def _list_tool_calls_chronological(self, thread_id: str, run_id: str) -> List[ToolCall]:
    """
    Collect every tool call made during a run, oldest first.

    :param thread_id: The ID of the thread.
    :type thread_id: str
    :param run_id: The ID of the run.
    :type run_id: str
    :return: A list of tool calls in chronological order.
    :rtype: List[ToolCall]
    """
    # A separate request to the AI service is needed for the tool call details and results;
    # the data retriever hands the run steps back already in chronological order.
    ordered_steps = self._data_retriever._list_run_steps_chronological(thread_id=thread_id, run_id=run_id)

    collected: List[ToolCall] = []
    for step in ordered_steps:
        # Only steps that are tool-call steps are of interest here.
        if step.type != _TOOL_CALLS:
            continue

        details: object = step.step_details
        if details.type != _TOOL_CALLS or len(details.tool_calls) < 1:
            continue

        # Every tool call inherits the timestamps of the run step it belongs to.
        collected.extend(
            ToolCall(
                created=step.created_at,
                completed=step.completed_at,
                details=single_call,
            )
            for single_call in details.tool_calls
        )

    return collected
105
+
106
@staticmethod
def _extract_function_tool_definitions(thread_run: object) -> List[Union[ToolDefinition, OpenAPIToolDefinition]]:
    """
    Extracts tool definitions from a thread run.

    Custom function tools and OpenAPI tools are converted in full. Any other tool type is
    included only when both ``_BUILT_IN_DESCRIPTIONS`` and ``_BUILT_IN_PARAMS`` contain an
    entry for it; otherwise it is skipped. The thread run itself is not modified.

    :param thread_run: The thread run containing tool definitions.
    :type thread_run: object
    :return: A list of tool definitions extracted from the thread run.
    :rtype: List[Union[ToolDefinition, OpenAPIToolDefinition]]
    """
    final_tools: List[Union[ToolDefinition, OpenAPIToolDefinition]] = []
    for tool in thread_run.tools:
        # Here we handle the custom functions and create tool definitions out of them.
        if tool.type == _FUNCTION:
            tool_function = tool.function

            # The target schema doesn't support required fields, so we omit it for now.
            # Work on a shallow copy: deleting the key from the original mapping would
            # silently alter the caller's thread run.
            parameters = dict(tool_function.parameters)
            parameters.pop("required", None)

            final_tools.append(
                ToolDefinition(
                    type="function",
                    name=tool_function.name,
                    description=tool_function.description,
                    parameters=parameters,
                )
            )
        elif tool.type == _OPENAPI:
            openapi_tool = tool.openapi
            # An OpenAPI tool without a "functions" entry yields an empty list rather than
            # failing on iteration over None.
            openapi_functions = openapi_tool.get("functions") or []
            tool_definition = OpenAPIToolDefinition(
                name=openapi_tool.name,
                description=openapi_tool.description,
                type=_OPENAPI,
                spec=openapi_tool.spec,
                auth=openapi_tool.auth.as_dict(),
                default_params=openapi_tool.default_params.as_dict() if openapi_tool.default_params else None,
                functions=[
                    ToolDefinition(
                        name=func.get("name"),
                        description=func.get("description"),
                        parameters=func.get("parameters"),
                        type="function",
                    )
                    for func in openapi_functions
                ],
            )
            final_tools.append(tool_definition)
        elif tool.type in _BUILT_IN_DESCRIPTIONS and tool.type in _BUILT_IN_PARAMS:
            # Add limited support for built-in tools. Descriptions and parameters
            # are not published, but we'll include placeholders.
            final_tools.append(
                ToolDefinition(
                    type=tool.type,
                    name=tool.type,
                    description=_BUILT_IN_DESCRIPTIONS[tool.type],
                    parameters=_BUILT_IN_PARAMS[tool.type],
                )
            )
    return final_tools
168
+
169
+ @staticmethod
170
+ def _break_into_query_responses(messages: List[Message], run_id: str) -> (List[Message], List[Message]):
171
+ """
172
+ Breaks a list of messages into query and response messages based on the run ID.
173
+
174
+ :param messages: The list of messages to be broken into query and response.
175
+ :type messages: List[Message]
176
+ :param run_id: The ID of the run to distinguish response messages.
177
+ :type run_id: str
178
+ :return: A tuple containing two lists - the first list contains query messages, and the second list contains response messages.
179
+ :rtype: (List[Message], List[Message])
180
+ """
181
+ query: List[Message] = [what for what in messages if what.run_id != run_id]
182
+ responses: List[Message] = [what for what in messages if what.run_id == run_id]
183
+ return query, responses
184
+
185
+ @staticmethod
186
+ def _filter_run_ids_up_to_run_id(run_ids: List[str], run_id: str, include_run_id: bool = True) -> List[str]:
187
+ """
188
+ Filters run IDs up to a specific run ID.
189
+
190
+ This method processes a list of run IDs and filters out run IDs that come after the specified run ID.
191
+ It ensures that only run IDs up to and including the specified run ID are included in the result.
192
+
193
+ :param run_ids: The list of run IDs in chronological order.
194
+ :type run_ids: List[str]
195
+ :param run_id: The ID of the run to filter messages up to.
196
+ :type run_id: str
197
+ :return: The filtered list of run IDs up to the specified run ID.
198
+ :rtype: List[str]
199
+ """
200
+ for index, single_run_id in enumerate(run_ids):
201
+ # Since this is the conversation of the entire thread and we are interested in a given run, we need to
202
+ # filter out the messages that came after the run.
203
+ if single_run_id == run_id:
204
+ if include_run_id:
205
+ return run_ids[: index + 1]
206
+ return run_ids[:index]
207
+
208
+ # If we didn't find the run_id, we return an empty list.
209
+ return []
210
+
211
+ @staticmethod
212
+ def _filter_messages_up_to_run_id(
213
+ chronological_messages, run_id: str, include_run_id: bool = True
214
+ ) -> List[Message]:
215
+ """
216
+ Filters messages up to a specific run ID.
217
+
218
+ This method processes a list of messages in chronological order and filters out messages that come after the specified run ID.
219
+ It ensures that only messages up to and including the specified run ID are included in the result.
220
+
221
+ :param chronological_messages: The list of messages in chronological order.
222
+ :type chronological_messages: List[Message]
223
+ :param run_id: The ID of the run to filter messages up to.
224
+ :type run_id: str
225
+ :return: The filtered list of messages up to the specified run ID.
226
+ :rtype: List[Message]
227
+ """
228
+ filtered_messages = []
229
+ in_my_current_run = False
230
+ for single_turn in chronological_messages:
231
+ # Since this is the conversation of the entire thread and we are interested in a given run, we need to
232
+ # filter out the messages that came after the run.
233
+ if single_turn.run_id is not None:
234
+ if single_turn.run_id == run_id:
235
+ in_my_current_run = True
236
+
237
+ # If we entered our current run and its the run that we have requested to filter up to, but
238
+ # not including, we can break out of the loop.
239
+ if not include_run_id:
240
+ break
241
+
242
+ # Then, if we think that we are currently in our run and we have a message that is not from our run,
243
+ # it means that we have left our run.
244
+ if in_my_current_run and single_turn.run_id != run_id:
245
+ break
246
+
247
+ # We're good to add it.
248
+ filtered_messages.append(single_turn)
249
+
250
+ return filtered_messages
251
+
252
@staticmethod
def _extract_typed_messages(ai_services_messages) -> List[Message]:
    """
    Converts raw AI service thread messages into typed ``Message`` objects.

    Each incoming message becomes a ``UserMessage`` or an ``AssistantMessage`` depending on its role.
    Messages with empty content, and messages whose role is neither the user role nor the agent role,
    are dropped. Only ``text`` and ``image`` content items are carried over; other content item types
    are ignored.

    :param ai_services_messages: The messages returned by the AI service.
    :type ai_services_messages: _models.OpenAIPageableListOfThreadMessage (some internal type from ai projects)
    :return: The typed messages, in the same relative order as the input.
    :rtype: List[Message]
    """
    typed_messages: List[Message] = []

    for raw_message in ai_services_messages:
        # A message without content has nothing to convert; skipping it avoids crashing on a
        # malformed historical message.
        if len(raw_message.content) < 1:
            continue

        # Translate every supported content item into the plain dictionary schema.
        parts = []
        for item in raw_message.content:
            if item.type == "text":
                parts.append({"type": "text", "text": item.text.value})
            elif item.type == "image":
                parts.append({"type": "image", "image": {"file_id": item.image_file.file_id}})

        if raw_message.role == _USER:
            # User messages are created without a run_id.
            typed_messages.append(UserMessage(content=parts, createdAt=raw_message.created_at))
        elif raw_message.role == _AGENT:
            # Assistant messages carry the run_id of the run that produced them. Only the user-facing
            # answer is available here; tool calls and their results are not part of this message.
            typed_messages.append(
                AssistantMessage(content=parts, run_id=raw_message.run_id, createdAt=raw_message.created_at)
            )

    return typed_messages
314
+
315
def _fetch_tool_calls(self, thread_id: str, run_id: str) -> List[Message]:
    """
    Retrieves the tool calls of one run and flattens them into messages.

    Every tool call yielded by ``_list_tool_calls_chronological`` is expanded with
    ``break_tool_call_into_messages``; the resulting messages are concatenated in the order produced.

    :param thread_id: The ID of the thread.
    :type thread_id: str
    :param run_id: The ID of the run.
    :type run_id: str
    :return: The messages derived from the run's tool calls.
    :rtype: List[Message]
    """
    return [
        message
        for tool_call in self._list_tool_calls_chronological(thread_id, run_id)
        for message in break_tool_call_into_messages(tool_call, run_id)
    ]
333
+
334
def _retrieve_tool_calls_up_to_including_run_id(
    self, thread_id: str, run_id: str, exclude_tool_calls_previous_runs: bool = False
) -> List[Message]:
    """
    Collects tool call messages for the given run and, optionally, for the runs preceding it.

    The tool calls of ``run_id`` are always included. Unless ``exclude_tool_calls_previous_runs`` is True,
    the tool calls of every earlier run in the thread are fetched as well, one request per run, in
    parallel on a thread pool. Messages of earlier runs are appended in completion order, so the result
    is not guaranteed to be chronological.

    :param thread_id: The ID of the thread.
    :type thread_id: str
    :param run_id: The ID of the run.
    :type run_id: str
    :param exclude_tool_calls_previous_runs: Whether to leave out tool calls from previous runs. Default is False.
    :type exclude_tool_calls_previous_runs: bool
    :return: A list of messages converted from tool calls.
    :rtype: List[Message]
    """
    # Each tool call of the requested run expands into separate call and result messages.
    collected: List[Message] = []
    for current_call in self._list_tool_calls_chronological(thread_id, run_id):
        collected.extend(break_tool_call_into_messages(current_call, run_id))

    # The caller opted out of the (expensive) history of earlier runs.
    if exclude_tool_calls_previous_runs:
        return collected

    # include_run_id=False: the requested run's tool calls were already gathered above.
    earlier_run_ids = AIAgentConverter._filter_run_ids_up_to_run_id(
        self._data_retriever._list_run_ids_chronological(thread_id), run_id, include_run_id=False
    )

    # One request per earlier run is slow when done sequentially, so fan the requests out on a pool.
    with ThreadPoolExecutor(max_workers=self._MAX_WORKERS) as executor:
        pending = {}
        for earlier_run_id in earlier_run_ids:
            pending[executor.submit(self._fetch_tool_calls, thread_id, earlier_run_id)] = earlier_run_id
        for finished in as_completed(pending):
            collected.extend(finished.result())

    return collected
383
+
384
def _retrieve_all_tool_calls(self, thread_id: str, run_ids: List[str]) -> List[Message]:
    """
    Fetches the tool call messages of several runs of a thread in parallel.

    One ``_fetch_tool_calls`` task is submitted per run ID; the results are concatenated in the order
    in which the tasks complete, which is not necessarily the order of ``run_ids``.

    :param thread_id: The ID of the thread.
    :type thread_id: str
    :param run_ids: The list of run IDs.
    :type run_ids: List[str]
    :return: A list of messages converted from tool calls.
    :rtype: List[Message]
    """
    collected: List[Message] = []

    with ThreadPoolExecutor(max_workers=self._MAX_WORKERS) as executor:
        pending = {}
        for single_run_id in run_ids:
            pending[executor.submit(self._fetch_tool_calls, thread_id, single_run_id)] = single_run_id
        for finished in as_completed(pending):
            collected.extend(finished.result())

    return collected
406
+
407
@staticmethod
def _is_agent_tool_call(message: Message) -> bool:
    """
    Tells whether a message is an agent message whose first content item is a tool call.

    :param message: The message to be checked.
    :type message: Message
    :return: True if the role is the agent role, the content is a non-empty list, and the first
        content item has a ``type`` equal to the tool call type; False otherwise.
    :rtype: bool
    """
    # Only agent messages can be tool calls.
    if message.role != _AGENT:
        return False

    # The content must be a non-empty list before its first element can be inspected.
    content = message.content
    if not isinstance(content, list) or len(content) == 0:
        return False

    first_item = content[0]
    return "type" in first_item and first_item["type"] == _TOOL_CALL
424
+
425
+ @staticmethod
426
+ def _sort_messages(messages: List[Message]) -> List[Message]:
427
+ """
428
+ Sorts a list of messages, placing messages with `createdAt` set to None at the beginning.
429
+
430
+ :param messages: The list of messages to be sorted.
431
+ :type messages: List[Message]
432
+ :return: The sorted list of messages.
433
+ :rtype: List[Message]
434
+ """
435
+ # Separate messages with createdAt set to None
436
+ none_created_at = [message for message in messages if message.createdAt is None]
437
+
438
+ # Filter out messages with createdAt set to None and sort the remaining messages
439
+ sorted_messages = sorted(
440
+ [message for message in messages if message.createdAt is not None],
441
+ key=lambda x: (x.createdAt, x.role == _AGENT),
442
+ )
443
+
444
+ # Combine the lists, placing messages with None createdAt at the beginning
445
+ return none_created_at + sorted_messages
446
+
447
def convert(self, thread_id: str, run_id: str, exclude_tool_calls_previous_runs: bool = False) -> dict:
    """
    Converts a single agent run into the evaluator input format.

    The output is assembled from the thread's messages up to and including the requested run, the tool
    call messages retrieved for that run (and for earlier runs unless excluded), and the run's
    instructions, which are inserted as a leading system message when present.

    :param thread_id: The ID of the thread.
    :type thread_id: str
    :param run_id: The ID of the run.
    :type run_id: str
    :param exclude_tool_calls_previous_runs: Whether to exclude tool calls from previous runs in the conversion.
    :type exclude_tool_calls_previous_runs: bool
    :return: The converted data in dictionary format.
    :rtype: dict
    """
    # The run is fetched once; it supplies both the instructions and the tool definitions.
    run_details: object = self._data_retriever._get_run(thread_id=thread_id, run_id=run_id)

    # Start from the user-facing conversation and cut it off after the requested run.
    thread_messages = self._data_retriever._list_messages_chronological(thread_id)
    messages_until_run = AIAgentConverter._filter_messages_up_to_run_id(thread_messages, run_id)

    # Move from the service's message type to the typed message schema.
    conversation = AIAgentConverter._extract_typed_messages(messages_until_run)

    # Add the tool calls and their results as messages.
    conversation.extend(
        self._retrieve_tool_calls_up_to_including_run_id(thread_id, run_id, exclude_tool_calls_previous_runs)
    )

    # Restore chronological order; the sort's secondary key places an assistant message after a tool
    # result that shares its timestamp.
    conversation = AIAgentConverter._sort_messages(conversation)

    # The instructions, when present, become the very first message. Their content is a plain string.
    system_prompt = run_details.instructions
    if system_prompt:
        conversation.insert(0, SystemMessage(content=system_prompt))

    # Split the conversation into what preceded the run's response and the response itself.
    query, responses = AIAgentConverter._break_into_query_responses(conversation, run_id)

    evaluator_data = EvaluatorData(
        query=query,
        response=responses,
        tool_definitions=AIAgentConverter._extract_function_tool_definitions(run_details),
    )

    # Round-trip through JSON to return plain dictionaries.
    return json.loads(evaluator_data.to_json())
501
+
502
def _prepare_single_thread_evaluation_data(self, thread_id: str, filename: str = None) -> List[dict]:
    """
    Builds one evaluation record per run of a thread and optionally appends the records to a file.

    Run IDs, messages and tool calls of the thread are retrieved once. For every run, the messages and
    tool calls up to that run are merged, sorted, prefixed with a system message built from the last
    run's instructions (when present), and split into query and response. Tool definitions are taken
    from the last run and reused for every record.

    :param thread_id: The ID of the thread.
    :type thread_id: str
    :param filename: The name of the JSONL file to append the records to. If None or empty, nothing is written.
    :type filename: str, optional
    :return: A list of evaluation data dictionaries; empty when the thread has no runs or no messages.
    :rtype: List[dict]
    """
    retriever = self._data_retriever

    # Without runs there is nothing to evaluate.
    run_ids = retriever._list_run_ids_chronological(thread_id)
    if len(run_ids) < 1:
        return []

    # Without messages there is nothing to evaluate either.
    thread_messages = retriever._list_messages_chronological(thread_id)
    if len(thread_messages) < 1:
        return []

    # Tool calls of all runs, fetched once and sorted so they can be filtered per run below.
    thread_tool_calls = AIAgentConverter._sort_messages(self._retrieve_all_tool_calls(thread_id, run_ids))

    # The last run provides the instructions and the tool definitions used for every record.
    latest_run = retriever._get_run(thread_id=thread_id, run_id=run_ids[-1])
    system_prompt = latest_run.instructions
    tool_definitions = AIAgentConverter._extract_function_tool_definitions(latest_run)

    evaluations: List[dict] = []
    for run_id in run_ids:
        # Conversation messages up to this run, converted to the typed message schema.
        messages_until_run = AIAgentConverter._filter_messages_up_to_run_id(thread_messages, run_id)
        typed_messages = AIAgentConverter._extract_typed_messages(messages_until_run)

        # Tool calls up to this run.
        tool_calls_until_run = AIAgentConverter._filter_messages_up_to_run_id(thread_tool_calls, run_id)

        # Merge both sources and restore chronological order.
        run_messages = AIAgentConverter._sort_messages([*typed_messages, *tool_calls_until_run])

        # The system message, whose content is a plain string, always goes first.
        if system_prompt:
            run_messages.insert(0, SystemMessage(content=system_prompt))

        query, responses = AIAgentConverter._break_into_query_responses(run_messages, run_id)
        evaluator_data = EvaluatorData(
            query=query,
            response=responses,
            tool_definitions=tool_definitions,
        )
        evaluations.append(json.loads(evaluator_data.to_json()))

    # Append one JSON document per line (JSONL) when a file name was given.
    if filename:
        with open(filename, mode="a", encoding="utf-8") as file:
            file.writelines(json.dumps(evaluation) + "\n" for evaluation in evaluations)

    # The records are returned whether or not they were written to a file.
    return evaluations
589
+
590
def prepare_evaluation_data(self, thread_ids: Union[str, List[str]], filename: str = None) -> List[dict]:
    """
    Prepares evaluation data for a given thread or list of threads and optionally writes it to a file.

    A single thread ID is delegated directly to ``_prepare_single_thread_evaluation_data``. For several
    thread IDs, the threads are processed in parallel on a thread pool and the per-thread results are
    concatenated in the order of ``thread_ids``, so the returned list and the written file are
    deterministic for a given input.

    :param thread_ids: The ID(s) of the thread(s). Can be a single thread ID or a list of thread IDs.
    :type thread_ids: Union[str, List[str]]
    :param filename: The name of the JSONL file to append the evaluation data to. If None or empty,
        the data is not written to a file.
    :type filename: str, optional
    :return: A list of evaluation data dictionaries.
    :rtype: List[dict]
    """
    # Single instance, pretty much the same as the list.
    if isinstance(thread_ids, str):
        return self._prepare_single_thread_evaluation_data(thread_id=thread_ids, filename=filename)

    evaluations: List[dict] = []
    with ThreadPoolExecutor(max_workers=self._MAX_WORKERS) as executor:
        # The worker is given no filename on purpose: writing from several threads into the same file
        # would need synchronization, so the file is written once below, after all threads are done.
        futures = [
            executor.submit(self._prepare_single_thread_evaluation_data, str(thread_id), None)
            for thread_id in thread_ids
        ]
        # Collect in submission order (not completion order) to keep the output order stable.
        for future in futures:
            evaluations.extend(future.result())

    # Append one JSON document per line (JSONL) when a file name was given.
    if filename:
        with open(filename, mode="a", encoding="utf-8") as file:
            file.writelines(json.dumps(evaluation) + "\n" for evaluation in evaluations)

    return evaluations
628
+
629
+ @staticmethod
630
+ def _run_ids_from_conversation(conversation: dict) -> List[str]:
631
+ """
632
+ Extracts a list of unique run IDs from a conversation dictionary.
633
+
634
+ :param conversation: The conversation dictionary containing messages.
635
+ :type conversation: dict
636
+ :return: A list of unique run IDs in the order they appear.
637
+ :rtype: List[str]
638
+ """
639
+ if not isinstance(conversation, dict) or "messages" not in conversation:
640
+ return []
641
+ run_ids_with_repetitions = [message["run_id"] for message in conversation["messages"] if "run_id" in message]
642
+ # Removes duplicates, requires Python 3.7+ to ensure order is preserved
643
+ run_ids = list(dict.fromkeys(run_ids_with_repetitions))
644
+ return run_ids
645
+
646
@staticmethod
def _convert_from_conversation(
    conversation: dict, run_id: str, exclude_tool_calls_previous_runs: bool = False
) -> dict:
    """
    Converts one run of an already loaded thread (a conversation dictionary) into the evaluator input format.

    :param conversation: The conversation dictionary object.
        The expected schema for the conversation dictionary is as follows:
        {
            "messages": [
                {
                    "role": str,        # The role of the message sender, e.g., "user", "assistant", "tool".
                    "content": list,    # A list of content dictionaries.
                    "run_id": str,      # The ID of the run associated with the message.
                    "createdAt": str,   # The timestamp when the message was created.
                    ...
                },
                ...
            ],
            "tools": [
                {
                    "name": str,        # The name of the tool.
                    "description": str, # The description of the tool (optional).
                    "parameters": dict  # The parameters for the tool.
                },
                ...
            ]
        }
    :type conversation: dict
    :param run_id: The ID of the run.
    :type run_id: str
    :param exclude_tool_calls_previous_runs: Whether to exclude tool calls from previous runs in the conversion.
    :type exclude_tool_calls_previous_runs: bool
    :return: The converted data in dictionary format.
    :rtype: dict
    """
    # Typed messages are easier to filter and reorder than raw dictionaries.
    raw_messages: List[dict] = conversation.get("messages", [])
    typed_messages: List[Message] = [convert_message(raw_message) for raw_message in raw_messages]

    kept_messages: List[Message] = []
    for candidate in AIAgentConverter._filter_messages_up_to_run_id(typed_messages, run_id):
        # By default everything is kept. Only when the caller asked for it, tool results and agent tool
        # calls that belong to a run other than the requested one are left out.
        is_excluded_tooling = (
            exclude_tool_calls_previous_runs
            and candidate.run_id != run_id
            and (candidate.role == _TOOL or AIAgentConverter._is_agent_tool_call(candidate))
        )
        if not is_excluded_tooling:
            kept_messages.append(candidate)

    # Messages without createdAt (such as system messages) move to the front; the rest is sorted.
    kept_messages = AIAgentConverter._sort_messages(kept_messages)

    # "description" is optional in the input; "name" and "parameters" are required.
    tool_definitions = []
    for tool in conversation.get("tools", []):
        tool_definitions.append(
            ToolDefinition(name=tool["name"], description=tool.get("description"), parameters=tool["parameters"])
        )

    # Split into what preceded the run's response (the query) and the response of the run itself.
    query, responses = AIAgentConverter._break_into_query_responses(kept_messages, run_id)

    evaluator_data = EvaluatorData(query=query, response=responses, tool_definitions=tool_definitions)

    # Round-trip through JSON to return plain dictionaries.
    return json.loads(evaluator_data.to_json())
726
+
727
@staticmethod
def _convert_from_file(filename: str, run_id: str, exclude_tool_calls_previous_runs: bool = False) -> dict:
    """
    Converts one run of a thread stored in a JSON file into the evaluator input format.

    The file is loaded and handed to ``_convert_from_conversation``.

    :param filename: The path to the JSON file.
        The expected schema for the JSON file is as follows:
        {
            "messages": [
                {
                    "role": str,        # The role of the message sender, e.g., "user", "assistant", "tool".
                    "content": list,    # A list of content dictionaries.
                    "run_id": str,      # The ID of the run associated with the message.
                    "createdAt": str,   # The timestamp when the message was created.
                    ...
                },
                ...
            ],
            "tools": [
                {
                    "name": str,        # The name of the tool.
                    "description": str, # The description of the tool.
                    "parameters": dict  # The parameters for the tool.
                },
                ...
            ]
        }
    :type filename: str
    :param run_id: The ID of the run.
    :type run_id: str
    :param exclude_tool_calls_previous_runs: Whether to exclude tool calls from previous runs in the
        conversion. Defaults to False, which matches the behavior before this parameter existed.
    :type exclude_tool_calls_previous_runs: bool
    :return: The converted data in dictionary format.
    :rtype: dict
    """
    with open(filename, mode="r", encoding="utf-8") as file:
        conversation = json.load(file)

    return AIAgentConverter._convert_from_conversation(conversation, run_id, exclude_tool_calls_previous_runs)
765
+
766
+
767
@experimental
class AIAgentDataRetriever:
    """
    Base class for objects that retrieve runs, messages, run steps and run IDs through a project client.

    Subclasses implement the four retrieval methods for a specific client API.
    """

    # NOTE(review): no ABC base is declared here, so the @abstractmethod markers below document intent
    # but do not, by themselves, prevent this class from being instantiated.

    # Maximum items to fetch in a single AI Services API call (imposed by the service).
    _AI_SERVICES_API_MAX_LIMIT = 100

    def __init__(self, project_client: AIProjectClient):
        """
        Stores the AI project client used for all API interactions.

        :param project_client: The AI project client used for API interactions.
        :type project_client: AIProjectClient
        """
        self.project_client = project_client

    @abstractmethod
    def _get_run(self, thread_id: str, run_id: str):
        """Returns the run ``run_id`` of the thread ``thread_id``; implemented by subclasses."""

    @abstractmethod
    def _list_messages_chronological(self, thread_id: str):
        """Returns the messages of the thread in ascending order; implemented by subclasses."""

    @abstractmethod
    def _list_run_steps_chronological(self, thread_id: str, run_id: str):
        """Returns the steps of the given run in ascending order; implemented by subclasses."""

    @abstractmethod
    def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
        """Returns the IDs of the thread's runs in ascending order; implemented by subclasses."""
796
+
797
+
798
@experimental
class LegacyAgentDataRetriever(AIAgentDataRetriever):
    """
    Data retriever for project clients that expose the flat ``agents.list_*`` / ``agents.get_run`` API.

    The list endpoints used here return one page at a time, so every listing method follows the
    ``has_more`` / ``last_id`` cursor until all pages have been read.
    """

    def __init__(self, **kwargs):
        super(LegacyAgentDataRetriever, self).__init__(**kwargs)

    def _list_messages_chronological(self, thread_id: str):
        """
        Lists messages in chronological order for a given thread.

        :param thread_id: The ID of the thread.
        :type thread_id: str
        :return: A list of messages in chronological order.
        """
        to_return = []

        has_more = True
        after = None
        while has_more:
            messages = self.project_client.agents.list_messages(
                thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after
            )
            # Remember where this page ended so the next request continues from there.
            has_more = messages.has_more
            after = messages.last_id
            if messages.data:
                to_return.extend(messages.data)

        return to_return

    def _list_run_steps_chronological(self, thread_id: str, run_id: str):
        """
        Lists the steps of a run in chronological order.

        :param thread_id: The ID of the thread.
        :type thread_id: str
        :param run_id: The ID of the run.
        :type run_id: str
        :return: A list of run steps in chronological order.
        """
        run_steps_chronological: List[object] = []
        has_more = True
        after = None
        while has_more:
            run_steps = self.project_client.agents.list_run_steps(
                thread_id=thread_id,
                run_id=run_id,
                limit=self._AI_SERVICES_API_MAX_LIMIT,
                order="asc",
                after=after,
                include=["step_details.tool_calls[*].file_search.results[*].content"],
            )
            # Remember where this page ended so the next request continues from there.
            has_more = run_steps.has_more
            after = run_steps.last_id
            if run_steps.data:
                run_steps_chronological.extend(run_steps.data)
        return run_steps_chronological

    def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
        """
        Lists run IDs in chronological order for a given thread.

        All pages are read, so threads with more runs than fit into a single response are returned
        completely.

        :param thread_id: The ID of the thread.
        :type thread_id: str
        :return: A list of run IDs in chronological order.
        :rtype: List[str]
        """
        run_ids: List[str] = []
        has_more = True
        after = None
        while has_more:
            runs = self.project_client.agents.list_runs(
                thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc", after=after
            )
            # Mapping-style access is kept from the original single-page implementation.
            run_ids.extend(run["id"] for run in runs["data"])
            after = runs.get("last_id")
            # Stop when the service reports no further page, or when it gives no cursor to continue from.
            has_more = bool(runs.get("has_more")) and after is not None
        return run_ids

    def _get_run(self, thread_id: str, run_id: str):
        """Returns the run ``run_id`` of the thread ``thread_id`` as provided by the project client."""
        return self.project_client.agents.get_run(thread_id=thread_id, run_id=run_id)
863
+
864
+
865
@experimental
class FDPAgentDataRetriever(AIAgentDataRetriever):
    """
    Data retriever for project clients that expose the grouped ``agents.messages`` / ``agents.runs`` /
    ``agents.run_steps`` operations.
    """

    def __init__(self, **kwargs):
        super(FDPAgentDataRetriever, self).__init__(**kwargs)

    def _list_messages_chronological(self, thread_id: str):
        """
        Lists messages in chronological order for a given thread.

        :param thread_id: The ID of the thread.
        :type thread_id: str
        :return: A list of messages in chronological order.
        """
        message_pages = self.project_client.agents.messages.list(
            thread_id=thread_id, limit=self._AI_SERVICES_API_MAX_LIMIT, order="asc"
        )
        # Materialize the iterable returned by the client into a list.
        return list(message_pages)

    def _list_run_steps_chronological(self, thread_id: str, run_id: str):
        """
        Lists the steps of a run in ascending order.

        :param thread_id: The ID of the thread.
        :type thread_id: str
        :param run_id: The ID of the run.
        :type run_id: str
        :return: The object returned by the client's ``run_steps.list`` call, passed through unchanged.
        """
        return self.project_client.agents.run_steps.list(
            thread_id=thread_id,
            run_id=run_id,
            limit=self._AI_SERVICES_API_MAX_LIMIT,
            order="asc",
            include=["step_details.tool_calls[*].file_search.results[*].content"],
        )

    def _list_run_ids_chronological(self, thread_id: str) -> List[str]:
        """
        Lists run IDs in ascending order for a given thread.

        :param thread_id: The ID of the thread.
        :type thread_id: str
        :return: The ``id`` of every run yielded by the client.
        :rtype: List[str]
        """
        thread_runs = self.project_client.agents.runs.list(thread_id=thread_id, order="asc")
        return [thread_run.id for thread_run in thread_runs]

    def _get_run(self, thread_id: str, run_id: str):
        """Returns the run ``run_id`` of the thread ``thread_id`` as provided by the project client."""
        return self.project_client.agents.runs.get(thread_id=thread_id, run_id=run_id)