azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (299) hide show
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,467 @@
1
+ import datetime
2
+ import json
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from typing import TYPE_CHECKING, Any, List, Optional, Union
7
+
8
# Models moved in a later version of agents SDK, so try a few different locations
# Only import for type checking to avoid runtime import errors
if TYPE_CHECKING:
    # Try azure.ai.projects first, then azure.ai.agents; the first import that
    # succeeds supplies the type used in annotations.
    try:
        from azure.ai.projects.models import RunStepFunctionToolCall
    except ImportError:
        try:
            from azure.ai.agents.models import RunStepFunctionToolCall
        except ImportError:
            # Create a protocol for type checking when the real class isn't available
            from typing import Protocol

            class RunStepFunctionToolCall(Protocol):
                """Protocol defining the expected interface for RunStepFunctionToolCall."""

                id: str
                type: str

                def get(self, key: str, default: Any = None) -> Any: ...

else:
    # At runtime, we don't need the actual class since it's only used in type annotations
    RunStepFunctionToolCall = Any
31
+
32
# Role names carried by conversation messages.
_SYSTEM = "system"
_USER = "user"
_AGENT = "assistant"
_TOOL = "tool"
_DEVELOPER = "developer"  # part of the semantic kernel

# Kinds of entries that tool details can include.
_TOOL_CALL = "tool_call"
_TOOL_RESULT = "tool_result"
_FUNCTION = "function"

# This is returned by AI services in the API to filter against tool invocations.
_TOOL_CALLS = "tool_calls"

# Names of the built-in tools; intended for use inside this file only.
_CODE_INTERPRETER = "code_interpreter"
_BING_GROUNDING = "bing_grounding"
_BING_CUSTOM_SEARCH = "bing_custom_search"
_FILE_SEARCH = "file_search"
_AZURE_AI_SEARCH = "azure_ai_search"
_SHAREPOINT_GROUNDING = "sharepoint_grounding"
_FABRIC_DATAAGENT = "fabric_dataagent"
_OPENAPI = "openapi"

# The services hide descriptions of built-in tools, so basic stand-in descriptions
# are supplied here for evaluation purposes.
_BUILT_IN_DESCRIPTIONS = {
    _CODE_INTERPRETER: (
        "Use code interpreter to read and interpret information from datasets, "
        "generate code, and create graphs and charts using your data. Supports "
        "up to 20 files."
    ),
    _BING_GROUNDING: "Enhance model output with web data.",
    _BING_CUSTOM_SEARCH: (
        "Enables agents to retrieve content from a curated subset of websites, "
        "enhancing relevance and reducing noise from public web searches."
    ),
    _FILE_SEARCH: (
        "Search for data across uploaded files. "
        "A single call can return multiple results/files in the 'results' field."
    ),
    _AZURE_AI_SEARCH: "Search an Azure AI Search index for relevant data.",
    _SHAREPOINT_GROUNDING: (
        "Allows agents to access and retrieve relevant content from Microsoft SharePoint "
        "document libraries, grounding responses in organizational knowledge."
    ),
    _FABRIC_DATAAGENT: (
        "Connect to Microsoft Fabric data agents to retrieve data across different data sources."
    ),
}


def _single_string_param(name, description):
    """Build an object schema that exposes exactly one string property called ``name``."""
    return {
        "type": "object",
        "properties": {name: {"type": "string", "description": description}},
    }


# The services also hide parameters of built-in tools, so basic stand-in parameter
# schemas are supplied here for evaluation purposes.
_BUILT_IN_PARAMS = {
    _CODE_INTERPRETER: _single_string_param("input", "Generated code to be executed."),
    _BING_GROUNDING: _single_string_param("requesturl", "URL used in Bing Search API."),
    _BING_CUSTOM_SEARCH: _single_string_param(
        "requesturl",
        "Search queries, along with pre-configured site restrictions or domain filters.",
    ),
    # File search is the only built-in tool here with a nested object parameter.
    _FILE_SEARCH: {
        "type": "object",
        "properties": {
            "ranking_options": {
                "type": "object",
                "properties": {
                    "ranker": {
                        "type": "string",
                        "description": "Ranking algorithm to use.",
                    },
                    "score_threshold": {
                        "type": "number",
                        "description": "Threshold for search results.",
                    },
                },
                "description": "Ranking options for search results.",
            }
        },
    },
    _AZURE_AI_SEARCH: _single_string_param("input", "Search terms to use."),
    _SHAREPOINT_GROUNDING: _single_string_param(
        "input", "A natural language query to search SharePoint content."
    ),
    _FABRIC_DATAAGENT: _single_string_param("input", "Search terms to use."),
}
118
+
119
+
120
class Message(BaseModel):
    """Represents a message in a conversation with agents, assistants, and tools. We need to export these structures
    to JSON for evaluators and we have custom fields such as createdAt, run_id, and tool_call_id, so we cannot use
    the standard pydantic models provided by OpenAI.

    :param createdAt: The timestamp when the message was created, given either as a datetime or as an integer.
        Optional.
    :type createdAt: Optional[Union[datetime.datetime, int]]
    :param run_id: The ID of the run associated with the message. Optional.
    :type run_id: Optional[str]
    :param tool_call_id: The ID of the tool call associated with the message. Optional.
    :type tool_call_id: Optional[str]
    :param role: The role of the message sender (e.g., system, user, tool, assistant).
    :type role: str
    :param content: The content of the message, which can be a string or a list of dictionaries.
    :type content: Union[str, List[dict]]
    """

    createdAt: Optional[Union[datetime.datetime, int]] = None  # SystemMessage wouldn't have this
    run_id: Optional[str] = None
    tool_call_id: Optional[str] = None  # see ToolMessage
    role: str
    content: Union[str, List[dict]]
140
+
141
+
142
class SystemMessage(Message):
    """Represents a system message in a conversation with agents, assistants, and tools.

    :param role: The role of the message sender; defaults to 'system'.
    :type role: str
    """

    role: str = _SYSTEM
150
+
151
+
152
class UserMessage(Message):
    """Represents a user message in a conversation with agents, assistants, and tools.

    :param role: The role of the message sender; defaults to 'user'.
    :type role: str
    """

    role: str = _USER
160
+
161
+
162
class SKDeveloperMessage(Message):
    """Represents a developer message in a conversation with agents, assistants, and tools.
    This is used in the context of Semantic Kernel (SK) agents.

    :param role: The role of the message sender; defaults to 'developer'.
    :type role: str
    """

    role: str = _DEVELOPER
171
+
172
+
173
class ToolMessage(Message):
    """Represents a tool message in a conversation with agents, assistants, and tools.

    :param run_id: The ID of the run associated with the message. Required for this message type.
    :type run_id: str
    :param role: The role of the message sender; defaults to 'tool'.
    :type role: str
    :param tool_call_id: The ID of the tool call associated with the message. Optional.
    :type tool_call_id: Optional[str]
    """

    run_id: str  # narrows the Optional[str] declared on Message to a required value
    role: str = _TOOL
    tool_call_id: Optional[str] = None
187
+
188
+
189
class SKToolMessage(Message):
    """Represents a tool message in the context of a Semantic Kernel (SK) agent.

    :param role: The role of the message sender; defaults to 'tool'.
    :type role: str
    :param tool_call_id: The ID of the tool call associated with the message. Optional.
    :type tool_call_id: Optional[str]
    """

    role: str = _TOOL
    tool_call_id: Optional[str] = None
200
+
201
+
202
class AssistantMessage(Message):
    """Represents an assistant message.

    :param run_id: The ID of the run associated with the message. Required for this message type.
    :type run_id: str
    :param role: The role of the message sender; defaults to 'assistant'.
    :type role: str
    """

    run_id: str  # narrows the Optional[str] declared on Message to a required value
    role: str = _AGENT
213
+
214
+
215
class SKAssistantMessage(Message):
    """Represents an assistant message in the context of a Semantic Kernel (SK) agent.

    :param role: The role of the message sender; defaults to 'assistant'.
    :type role: str
    """

    role: str = _AGENT
233
+
234
+
235
class ToolDefinition(BaseModel):
    """Represents a tool definition that will be used in the agent.

    :param name: The name of the tool.
    :type name: str
    :param type: The type of the tool.
    :type type: str
    :param description: A description of the tool. Optional.
    :type description: Optional[str]
    :param parameters: The parameters required by the tool.
    :type parameters: dict
    """

    name: str
    type: str
    description: Optional[str] = None
    parameters: dict
252
+
253
+
254
class OpenAPIToolDefinition(BaseModel):
    """Represents OpenAPI tool definition that will be used in the agent.

    :param name: The name of the tool.
    :type name: str
    :param type: The type of the tool.
    :type type: str
    :param description: A description of the tool. Optional.
    :type description: Optional[str]
    :param spec: The specification of the tool. Accepted as any object; presumably the OpenAPI
        document itself (confirm against the converter that builds this model).
    :type spec: object
    :param auth: The authentication settings of the tool. Accepted as any object; its exact shape
        is not defined in this model.
    :type auth: object
    :param default_params: A list of strings holding the tool's default parameters. Optional.
    :type default_params: Optional[List[str]]
    :param functions: The function definitions exposed by the tool.
    :type functions: List[ToolDefinition]
    """

    name: str
    type: str
    description: Optional[str] = None
    spec: object
    auth: object
    # typing.List is used here to match the annotation style of the rest of this module.
    default_params: Optional[List[str]] = None
    functions: List[ToolDefinition]
273
+
274
+
275
class ToolCall:
    """Intermediate record of a single tool call, used while converting run data.

    :param created: The timestamp when the tool call was created.
    :type created: datetime.datetime
    :param completed: The timestamp when the tool call was completed.
    :type completed: datetime.datetime
    :param details: The details of the tool call.
    :type details: RunStepFunctionToolCall
    """

    def __init__(self, created: datetime.datetime, completed: datetime.datetime, details: RunStepFunctionToolCall):
        # Store the timing information and the call details exactly as supplied.
        self.created, self.completed, self.details = created, completed, details
290
+
291
+
292
class EvaluatorData(BaseModel):
    """Container for the outcome of a conversion.

    :param query: A list of messages representing the system message, chat history, and user query.
    :type query: List[Message]
    :param response: A list of messages representing the assistant's response, including tool calls and results.
    :type response: List[Message]
    :param tool_definitions: A list of tool definitions used in the agent.
    :type tool_definitions: List[Union[ToolDefinition, OpenAPIToolDefinition]]
    """

    query: List[Message]
    response: List[Message]
    tool_definitions: List[Union[ToolDefinition, OpenAPIToolDefinition]]

    def to_json(self):
        """Serialize this result to JSON text.

        Fields whose value is ``None`` are left out of the output.

        :return: The JSON representation of the result.
        :rtype: str
        """
        serialized = self.model_dump_json(exclude={}, exclude_none=True)
        return serialized
314
+
315
+
316
def break_tool_call_into_messages(tool_call: ToolCall, run_id: str) -> List[Message]:
    """
    Breaks a tool call into a list of messages, including the tool call and its result.

    The first message is an assistant message describing the call (tool name and arguments). When an
    output can be retrieved, a second tool message carrying that output follows. Built-in tool types
    that are not recognised are skipped, in which case the returned list may be empty or contain only
    the call message.

    :param tool_call: The tool call to be broken into messages.
    :type tool_call: ToolCall
    :param run_id: The ID of the run associated with the messages.
    :type run_id: str
    :return: A list of messages representing the tool call and its result.
    :rtype: List[Message]
    """
    # We will use this as our accumulator.
    messages: List[Message] = []
    details = tool_call.details

    # Decide once whether this is a custom function call; the same answer drives both the
    # "call" half and the "result" half below.
    is_function_call = bool(hasattr(details, _FUNCTION) or details.get("function"))

    # As of March 17th, 2025, we only support custom functions due to built-in code interpreters and bing grounding
    # tooling not reporting their function calls in the same way. Code interpreters don't include the tool call at
    # all in most of the cases, and bing would only show the API URL, without arguments or results.
    # Bing grounding would have "bing_grounding" in details with "requesturl" that will just be the API path with query.
    # TODO: Work with AI Services to add converter support for BingGrounding and CodeInterpreter.
    if is_function_call:
        # This is the internals of the content object that will be included with the tool call.
        tool_call_id = details.id
        function_details = details.get(_FUNCTION)
        raw_arguments = function_details.get("arguments") if function_details else None
        content_tool_call = {
            "type": _TOOL_CALL,
            "tool_call_id": tool_call_id,
            "name": function_details.get("name") if function_details else None,
            # Absent arguments stay None instead of being handed to the JSON parser.
            "arguments": safe_loads(raw_arguments) if raw_arguments is not None else None,
        }
    else:
        # Treat built-in tools separately. Object models may be unique so handle each case separately
        # Just converting to dicts here rather than custom serializers for simplicity for now.
        # Don't fail if we run into a newly seen tool, just skip
        tool_type = details["type"]
        if tool_type == "code_interpreter":
            arguments = {"input": details.code_interpreter.input}
        elif tool_type == "bing_grounding":
            arguments = {"requesturl": details["bing_grounding"]["requesturl"]}
        elif tool_type == "file_search":
            options = details["file_search"]["ranking_options"]
            arguments = {
                "ranking_options": {"ranker": options["ranker"], "score_threshold": options["score_threshold"]}
            }
        elif tool_type == "azure_ai_search":
            arguments = {"input": details["azure_ai_search"]["input"]}
        elif tool_type == "fabric_dataagent":
            arguments = {"input": details["fabric_dataagent"]["input"]}
        else:
            # unsupported tool type, skip
            return messages
        try:
            tool_call_id = details.id
            content_tool_call = {
                "type": _TOOL_CALL,
                "tool_call_id": tool_call_id,
                "name": details.type,
                "arguments": arguments,
            }
        except Exception:
            # Best effort: a details object without id/type is skipped. Only ordinary errors are
            # swallowed so KeyboardInterrupt/SystemExit still propagate.
            return messages

    # We format it into an assistant message, where the content is a singleton list of the content object.
    # It should be a tool message, since this is the call, but the given schema treats this message as
    # assistant's action of calling the tool.
    messages.append(AssistantMessage(run_id=run_id, content=[to_dict(content_tool_call)], createdAt=tool_call.created))

    if is_function_call:
        output = safe_loads(details.get("function")["output"])
    else:
        try:
            # Some built-ins may have output, others may not
            # Try to retrieve it, but if we don't find anything, skip adding the message
            # Just manually converting to dicts for easy serialization for now rather than custom serializers
            if details.type == _CODE_INTERPRETER:
                output = [result.as_dict() for result in details.code_interpreter.outputs]
            elif details.type == _BING_GROUNDING:
                return messages  # not supported yet from bing grounding tool
            elif details.type == _FILE_SEARCH:
                output = [result.as_dict() for result in details.file_search.results]
            elif details.type == _AZURE_AI_SEARCH:
                output = details.azure_ai_search["output"]
            elif details.type == _FABRIC_DATAAGENT:
                output = details.fabric_dataagent["output"]
            else:
                # No known output shape for this type; return the call message only rather than
                # falling through with `output` unbound.
                return messages
        except Exception:
            # Output could not be read for this built-in; keep the call message and stop here.
            return messages

    # Now, onto the tool result, which only includes the result of the function call.
    content_tool_call_result = {"type": _TOOL_RESULT, _TOOL_RESULT: output}

    # Since this is a tool's action of returning, we put it as a tool message.
    messages.append(
        ToolMessage(
            run_id=run_id,
            tool_call_id=tool_call_id,
            content=[to_dict(content_tool_call_result)],
            createdAt=tool_call.completed,
        )
    )
    return messages
415
+
416
+
417
def to_dict(obj) -> dict:
    """
    Produces a plain, JSON-compatible copy of an object by round-tripping it through JSON.

    :param obj: The object to be converted. It must be serializable by ``json.dumps``.
    :type obj: Any
    :return: The dictionary representation of the object.
    :rtype: dict
    """
    # Encode to JSON text first, then decode that text back into built-in Python containers.
    encoded = json.dumps(obj)
    decoded = json.loads(encoded)
    return decoded
427
+
428
+
429
def safe_loads(data: Optional[str]) -> Optional[Union[dict, str]]:
    """
    Safely loads a JSON string into a Python object or returns the original input if loading fails.

    Input that is not valid JSON, and input that is not text at all (for example ``None``, which
    callers pass when a tool call has no arguments), is returned unchanged instead of raising.

    :param data: The JSON string to be loaded.
    :type data: Optional[str]
    :return: The decoded JSON value, or the original input when it cannot be decoded.
    :rtype: Optional[Union[dict, str]]
    """
    try:
        return json.loads(data)
    except (json.JSONDecodeError, TypeError):
        # JSONDecodeError: the text is not valid JSON.
        # TypeError: the input is not str/bytes/bytearray (e.g. None).
        return data
441
+
442
+
443
def convert_message(msg: dict) -> Message:
    """
    Builds the Message subclass that matches the ``role`` entry of a message dictionary.

    :param msg: The message dictionary.
    :type msg: dict
    :return: The Message object.
    :rtype: Message
    :raises ValueError: If the role is not one of ``system``, ``user``, ``assistant`` or ``tool``.
    """
    role = msg["role"]

    # Each recognised role returns immediately; anything left over is rejected at the bottom.
    if role == "system":
        return SystemMessage(content=str(msg["content"]))

    if role == "user":
        return UserMessage(content=msg["content"], createdAt=msg["createdAt"])

    if role == "assistant":
        return AssistantMessage(run_id=str(msg["run_id"]), content=msg["content"], createdAt=msg["createdAt"])

    if role == "tool":
        return ToolMessage(
            run_id=str(msg["run_id"]),
            tool_call_id=str(msg["tool_call_id"]),
            content=msg["content"],
            createdAt=msg["createdAt"],
        )

    raise ValueError(f"Unknown role: {role}")