azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic; see the registry advisory for more details.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -1,51 +1,157 @@
1
1
  # ---------------------------------------------------------
2
2
  # Copyright (c) Microsoft Corporation. All rights reserved.
3
3
  # ---------------------------------------------------------
4
-
4
+ import os
5
+ import posixpath
6
+ import re
7
+ import math
5
8
  import threading
6
- from typing import List, Optional, Union
9
+ from typing import Any, List, Literal, Mapping, Optional, Type, TypeVar, Tuple, Union, cast, get_args, get_origin
7
10
 
8
11
  import nltk
9
- import numpy as np
10
-
11
- from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
12
+ from azure.storage.blob import ContainerClient
13
+ from typing_extensions import NotRequired, Required, TypeGuard, TypeIs
14
+ from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
15
+ from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
16
+ from azure.ai.evaluation._exceptions import ErrorMessage, ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
17
+ from azure.ai.evaluation._model_configurations import (
18
+ AzureAIProject,
19
+ AzureOpenAIModelConfiguration,
20
+ OpenAIModelConfiguration,
21
+ )
12
22
 
13
23
  from . import constants
24
+ from .constants import EvaluatorScoringPattern, EVALUATOR_SCORING_PATTERNS, SCORING_PATTERN_CONFIG
14
25
 
15
26
  _nltk_data_download_lock = threading.Lock()
16
27
 
28
+ T_TypedDict = TypeVar("T_TypedDict", bound=Mapping[Any, Any])
29
+
17
30
 
18
- def get_harm_severity_level(harm_score: int) -> str:
31
def get_harm_severity_level(harm_score: Union[float, int], evaluator: Optional[str] = None) -> Union[str, float]:
    """Map a numeric harm score onto a harm severity level.

    :param harm_score: The harm score to be evaluated.
    :type harm_score: Union[float, int]
    :param evaluator: Optional evaluator name used to select the scoring pattern.
        When omitted, the default 0-7 scale applies.
    :type evaluator: Optional[str]
    :return: The harm severity level. If harm score is None or NaN, returns NaN.
    :rtype: Union[str, float]
    """
    score_is_missing = harm_score is None or (isinstance(harm_score, float) and math.isnan(harm_score))
    if score_is_missing:
        return math.nan

    # Resolve the scoring pattern configured for this evaluator (if any).
    scoring_config = SCORING_PATTERN_CONFIG.get(get_evaluator_scoring_pattern(evaluator))

    if scoring_config:
        # Pattern-specific mapping: a range may span more than two entries,
        # so compare against its first and last values.
        for level, bounds in scoring_config.get("severity_mapping", {}).items():
            if bounds[0] <= harm_score <= bounds[-1]:
                return level.value
        return math.nan

    # No pattern config available: fall back to the classic 0-7 scale.
    default_mapping = {
        constants.HarmSeverityLevel.VeryLow: (0, 1),
        constants.HarmSeverityLevel.Low: (2, 3),
        constants.HarmSeverityLevel.Medium: (4, 5),
        constants.HarmSeverityLevel.High: (6, 7),
    }
    for level, (low, high) in default_mapping.items():
        if low <= harm_score <= high:
            return level.value
    return math.nan
68
+
69
+
70
def get_evaluator_scoring_pattern(evaluator: Optional[str]) -> EvaluatorScoringPattern:
    """Resolve the scoring pattern registered for a given evaluator.

    :param evaluator: The name of the evaluator.
    :type evaluator: Optional[str]
    :return: The scoring pattern for the evaluator, defaults to SCALE_0_7 if not found.
    :rtype: EvaluatorScoringPattern
    """
    if evaluator is None:
        return EvaluatorScoringPattern.SCALE_0_7

    # An exact registry hit wins over any fuzzy match.
    if evaluator in EVALUATOR_SCORING_PATTERNS:
        return EVALUATOR_SCORING_PATTERNS[evaluator]

    # Otherwise compare case-insensitively with underscores stripped.
    target = evaluator.lower().replace("_", "")
    fuzzy_matches = (
        pattern
        for key, pattern in EVALUATOR_SCORING_PATTERNS.items()
        if key.replace("_", "").lower() == target
    )
    # Unknown evaluators fall back to the 0-7 scale.
    return next(fuzzy_matches, EvaluatorScoringPattern.SCALE_0_7)
95
+
96
+
97
def get_default_threshold_for_evaluator(evaluator: str) -> int:
    """Look up the default attack success threshold for a given evaluator.

    :param evaluator: The name of the evaluator.
    :type evaluator: str
    :return: The default threshold value (3 when no pattern config exists).
    :rtype: int
    """
    scoring_pattern = get_evaluator_scoring_pattern(evaluator)
    # Missing pattern config or missing threshold entry both fall back to 3.
    return SCORING_PATTERN_CONFIG.get(scoring_pattern, {}).get("default_threshold", 3)
108
+
109
+
110
def convert_binary_to_numeric(value: Union[str, bool, int]) -> int:
    """Convert binary evaluator outputs to numeric scores.

    :param value: The binary value ("safe"/"unsafe", "true"/"false", True/False, 0/1).
    :type value: Union[str, bool, int]
    :return: 0 for safe/true, 1 for unsafe/false; integers pass through unchanged.
    :rtype: int
    :raises ValueError: If the value cannot be interpreted as a binary score.
    """
    # bool is a subclass of int, so it must be tested first.
    if isinstance(value, bool):
        return 0 if value else 1

    if isinstance(value, int):
        return value

    if isinstance(value, str):
        numeric_by_label = {"safe": 0, "true": 0, "unsafe": 1, "false": 1}
        label = value.strip().lower()
        if label in numeric_by_label:
            return numeric_by_label[label]

    raise ValueError(f"Unable to convert value '{value}' to numeric score")
38
138
 
39
139
 
40
140
def ensure_nltk_data_downloaded():
    """Download NLTK data packages if not already downloaded."""
    # Map package id -> resource path used to probe the local NLTK data cache.
    required_packages = {
        "wordnet": "corpora/wordnet.zip",
        "perluniprops": "misc/perluniprops.zip",
        "punkt": "tokenizers/punkt.zip",
        "punkt_tab": "tokenizers/punkt_tab.zip",
    }

    # Serialize downloads across threads so the cache is not written concurrently.
    with _nltk_data_download_lock:
        for package_id, resource_path in required_packages.items():
            try:
                # Probe the cache; nltk.find raises LookupError when absent.
                nltk.find(resource_path)
            except LookupError:
                nltk.download(package_id)
49
155
 
50
156
 
51
157
  def nltk_tokenize(text: str) -> List[str]:
@@ -70,18 +176,748 @@ def nltk_tokenize(text: str) -> List[str]:
70
176
  return list(tokens)
71
177
 
72
178
 
73
- def ensure_api_version_in_aoai_model_config(
179
+ def _is_aoi_model_config(val: object) -> TypeGuard[AzureOpenAIModelConfiguration]:
180
+ return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("azure_endpoint", "azure_deployment"))
181
+
182
+
183
+ def _is_openai_model_config(val: object) -> TypeGuard[OpenAIModelConfiguration]:
184
+ return isinstance(val, dict) and all(isinstance(val.get(k), str) for k in ("model"))
185
+
186
+
187
def parse_model_config_type(
    model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
) -> None:
    """Stamp the ``type`` discriminator onto ``model_config`` in place.

    Azure OpenAI configs get ``AZURE_OPENAI_TYPE``, OpenAI configs get
    ``OPENAI_TYPE``; anything else is left untouched.
    """
    if _is_aoi_model_config(model_config):
        model_config["type"] = AZURE_OPENAI_TYPE
        return
    if _is_openai_model_config(model_config):
        model_config["type"] = OPENAI_TYPE
79
194
 
80
195
 
81
- def ensure_user_agent_in_aoai_model_config(
196
def construct_prompty_model_config(
    model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
    default_api_version: str,
    user_agent: str,
) -> dict:
    """Build the prompty model configuration dict around ``model_config``.

    Side effects: stamps the ``type`` discriminator on ``model_config`` and, for
    Azure OpenAI configs, fills in ``api_version`` when absent.

    :param model_config: The Azure OpenAI or OpenAI model configuration.
    :param default_api_version: API version used when the config does not supply one.
    :param user_agent: User-agent header value (applied to Azure OpenAI configs only).
    :return: Dict with "configuration" and "parameters" (including extra headers).
    """
    parse_model_config_type(model_config)

    is_azure = _is_aoi_model_config(model_config)
    if is_azure:
        model_config["api_version"] = model_config.get("api_version", default_api_version)

    extra_headers: dict = {}
    # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
    # https://github.com/encode/httpx/discussions/2959
    extra_headers["Connection"] = "close"
    if is_azure and user_agent:
        extra_headers["x-ms-useragent"] = user_agent

    return {"configuration": model_config, "parameters": {"extra_headers": extra_headers}}
216
+
217
+
218
def is_onedp_project(azure_ai_project: Optional[Union[str, AzureAIProject]]) -> TypeIs[str]:
    """Check if the Azure AI project is an OneDP project.

    A OneDP project is passed as a plain string rather than a scope dictionary.

    :param azure_ai_project: The scope of the Azure AI project.
    :type azure_ai_project: Optional[Union[str,~azure.ai.evaluation.AzureAIProject]]
    :return: True if the Azure AI project is an OneDP project, False otherwise.
    :rtype: bool
    """
    return isinstance(azure_ai_project, str)
227
+
228
+
229
def validate_azure_ai_project(o: object) -> AzureAIProject:
    """Validate that ``o`` is a usable Azure AI project reference.

    Accepts either a OneDP endpoint string (returned unchanged) or a scope dict
    with string-valued ``subscription_id``, ``resource_group_name`` and
    ``project_name`` fields.

    :param o: The candidate project reference.
    :return: The validated project reference.
    :raises EvaluationException: If ``o`` is neither a string nor a well-formed dict.
    """
    # TODO : Add regex check for malformed project uri
    if is_onedp_project(o):
        return o

    def _fail(message: str) -> None:
        # All validation failures are user-blamed INVALID_VALUE errors.
        raise EvaluationException(
            message=message,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.USER_ERROR,
        )

    if not isinstance(o, dict):
        _fail("The 'azure_ai_project' parameter must be a dictionary.")

    required_fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}

    missing_fields = set(required_fields.keys()) - o.keys()
    if missing_fields:
        _fail(
            "The 'azure_ai_project' dictionary is missing the following required "
            f"field(s): {', '.join(f'{field}' for field in missing_fields)}."
        )

    for field_name, expected_type in required_fields.items():
        if not isinstance(o[field_name], expected_type):
            _fail(f"Invalid type for field '{field_name}'. Expected {expected_type}, but got {type(o[field_name])}.")

    return cast(AzureAIProject, o)
269
+
270
+
271
def validate_model_config(config: dict) -> Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]:
    """Validate ``config`` against the two supported model-config TypedDicts.

    Azure OpenAI is tried first, then OpenAI; if both fail, the failures are
    wrapped in an :class:`EvaluationException` attributed to the user.
    """
    last_error = None
    for config_type in (AzureOpenAIModelConfiguration, OpenAIModelConfiguration):
        try:
            return _validate_typed_dict(config, config_type)
        except TypeError as e:
            last_error = e
    msg = "Model config validation failed."
    raise EvaluationException(
        message=msg, internal_message=msg, category=ErrorCategory.MISSING_FIELD, blame=ErrorBlame.USER_ERROR
    ) from last_error
282
+
283
+
284
+ def _validate_typed_dict(o: object, t: Type[T_TypedDict]) -> T_TypedDict:
285
+ """Do very basic runtime validation that an object is a typed dict
286
+
287
+ .. warning::
288
+
289
+ This validation is very basic, robust enough to cover some very simple TypedDicts.
290
+ Ideally, validation of this kind should be delegated to something more robust.
291
+
292
+ You will very quickly run into limitations trying to apply this function more broadly:
293
+ * Doesn't support stringized annotations at all
294
+ * Very limited support for generics, and "special form" (NoReturn, NotRequired, Required, etc...) types.
295
+ * Error messages are poor, especially if there is any nesting.
296
+
297
+ :param object o: The object to check
298
+ :param Type[T_TypedDict] t: The TypedDict to validate against
299
+ :raises NotImplementedError: Several forms of validation are unsupported
300
+ * Checking against stringized annotations
301
+ * Checking a generic that is not one of a few basic forms
302
+ :raises TypeError: If a value does not match the specified annotation
303
+ :raises ValueError: If t's annotation is not a string, type of a special form (e.g. NotRequired, Required, etc...)
304
+ :returns: The object passed in
305
+ :rtype: T_TypedDict
306
+ """
307
+ if not isinstance(o, dict):
308
+ raise TypeError(f"Expected type 'dict', got type '{type(object)}'.")
309
+
310
+ annotations = t.__annotations__
311
+ is_total = getattr(t, "__total__", False)
312
+ unknown_keys = set(o.keys()) - annotations.keys()
313
+
314
+ if unknown_keys:
315
+ raise TypeError(f"dict contains unknown keys: {list(unknown_keys)!r}")
316
+
317
+ required_keys = {
318
+ k
319
+ for k in annotations
320
+ if (is_total and get_origin(annotations[k]) is not NotRequired)
321
+ or (not is_total and get_origin(annotations[k]) is Required)
322
+ }
323
+
324
+ missing_keys = required_keys - o.keys()
325
+
326
+ if missing_keys:
327
+ raise TypeError(f"Missing required keys: {list(missing_keys)!r}.")
328
+
329
+ def validate_annotation(v: object, annotation: Union[str, type, object]) -> bool:
330
+ if isinstance(annotation, str):
331
+ raise NotImplementedError("Missing support for validating against stringized annotations.")
332
+
333
+ if (origin := get_origin(annotation)) is not None:
334
+ if origin is tuple:
335
+ validate_annotation(v, tuple)
336
+ tuple_args = get_args(annotation)
337
+ if len(cast(tuple, v)) != len(tuple_args):
338
+ raise TypeError(f"Expected a {len(tuple_args)}-tuple, got a {len(cast(tuple, v))}-tuple.")
339
+ for tuple_val, tuple_args in zip(cast(tuple, v), tuple_args):
340
+ validate_annotation(tuple_val, tuple_args)
341
+ elif origin is dict:
342
+ validate_annotation(v, dict)
343
+ dict_key_ann, dict_val_ann = get_args(annotation)
344
+ for dict_key, dict_val in cast(dict, v).items():
345
+ validate_annotation(dict_val, dict_val_ann)
346
+ validate_annotation(dict_key, dict_key_ann)
347
+ elif origin is list:
348
+ validate_annotation(v, list)
349
+ list_val_ann = get_args(annotation)[0]
350
+ for list_val in cast(list, v):
351
+ validate_annotation(list_val, list_val_ann)
352
+ elif origin is Union:
353
+ for generic_arg in get_args(annotation):
354
+ try:
355
+ validate_annotation(v, generic_arg)
356
+ return True
357
+ except TypeError:
358
+ pass
359
+ raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}")
360
+ elif origin is Literal:
361
+ literal_args = get_args(annotation)
362
+ if not any(type(literal) is type(v) and literal == v for literal in literal_args):
363
+ raise TypeError(f"Expected value to be one of {list(literal_args)!r}. Received type {type(v)}")
364
+ elif any(origin is g for g in (NotRequired, Required)):
365
+ validate_annotation(v, get_args(annotation)[0])
366
+ else:
367
+ raise NotImplementedError(f"Validation not implemented for generic {origin}.")
368
+ return True
369
+
370
+ if isinstance(annotation, type):
371
+ if not isinstance(v, annotation):
372
+ raise TypeError(f"Expected value to have type {annotation}. Received type {type(v)}.")
373
+ return True
374
+
375
+ raise ValueError("Annotation to validate against should be a str, type, or generic.")
376
+
377
+ for k, v in o.items():
378
+ validate_annotation(v, annotations[k])
379
+
380
+ return cast(T_TypedDict, o)
381
+
382
+
383
def check_score_is_valid(score: Union[str, float], min_score=1, max_score=5) -> bool:
    """Check if the score is valid, i.e. is convertable to number and is in the range [min_score, max_score].

    :param score: The score to check.
    :type score: Union[str, float]
    :param min_score: The minimum score. Default is 1.
    :type min_score: int
    :param max_score: The maximum score. Default is 5.
    :type max_score: int
    :return: True if the score is valid, False otherwise.
    :rtype: bool
    """
    try:
        # float() raises ValueError for unparsable strings, TypeError for e.g. None.
        return min_score <= float(score) <= max_score
    except (ValueError, TypeError):
        return False
401
+
402
+
403
def parse_quality_evaluator_reason_score(llm_output: str, valid_score_range: str = "[1-5]") -> Tuple[float, str]:
    """Parse the output of prompt-based quality evaluators that return a score and reason.

    Current supported evaluators:
        - Fluency
        - Relevance
        - Retrieval
        - Groundedness
        - Coherence
        - ResponseCompleteness
        - TaskAdherence

    The reason is read from the ``<S1>...</S1>`` tag and the score from the
    ``<S2>...</S2>`` tag; a missing or empty output yields ``math.nan``.

    :param llm_output: The output of the prompt-based quality evaluator.
    :type llm_output: str
    :return: The score and reason.
    :rtype: Tuple[float, str]
    """
    score, reason = math.nan, ""
    if not llm_output:
        return score, reason

    try:
        score_matches = re.findall(rf"<S2>\D*?({valid_score_range}).*?</S2>", llm_output, re.DOTALL)
        reason_matches = re.findall(r"<S1>(.*?)</S1>", llm_output, re.DOTALL)
        if score_matches:
            score = float(score_matches[0].strip())
        if reason_matches:
            reason = reason_matches[0].strip()
    except ValueError as exc:
        raise EvaluationException(
            message=f"Failed to parse model output: \n{llm_output}",
            internal_message="Failed to parse model output.",
            category=ErrorCategory.FAILED_EXECUTION,
            blame=ErrorBlame.SYSTEM_ERROR,
        ) from exc

    return score, reason
441
+
442
+
443
def remove_optional_singletons(eval_class, singletons):
    """Return a copy of ``singletons`` without entries the evaluator marks optional.

    Optional parameter names are read from the evaluator class's
    ``_OPTIONAL_PARAMS`` attribute; an absent attribute means nothing is optional.
    """
    pruned = singletons.copy()
    for param in getattr(eval_class, "_OPTIONAL_PARAMS", ()):  # pylint: disable=protected-access
        pruned.pop(param, None)
    return pruned
450
+
451
+
452
def retrieve_content_type(assistant_messages: List, metric: str) -> str:
    """Get the content type for service payload.

    :param assistant_messages: The list of messages to be annotated by evaluation service
    :type assistant_messages: list
    :param metric: A string representing the metric type
    :type metric: str
    :return: A text representing the content type. Example: 'text', or 'image'
    :rtype: str
    """
    # The protected_material metric is always treated as image content.
    if metric == "protected_material":
        return "image"

    # Any image_url content part anywhere in the messages makes the payload "image".
    for message in assistant_messages:
        content = message.get("content", [])
        if isinstance(content, list) and any(part.get("type") == "image_url" for part in content):
            return "image"

    # Default when no image was found.
    return "text"
475
+
476
+
477
def validate_conversation(conversation):
    """Validate a multimodal conversation payload before evaluation.

    Checks that the conversation has a ``messages`` list, that every message has
    a recognized role and string/list content, that at least one image is
    present, that both user and assistant roles appear, and that there is at
    most one assistant turn. Messages may be plain dicts or, alternatively,
    ``azure.ai.inference`` message models (imported lazily).

    :param conversation: Conversation payload with a ``messages`` key.
    :raises EvaluationException: If any structural requirement is violated.
    """

    def raise_exception(msg, target):
        # Single helper so every validation failure is reported uniformly as a
        # user-blamed INVALID_VALUE error.
        raise EvaluationException(
            message=msg,
            internal_message=msg,
            target=target,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.USER_ERROR,
        )

    if not conversation or "messages" not in conversation:
        raise_exception(
            "Attribute 'messages' is missing in the request",
            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
        )
    messages = conversation["messages"]
    if not isinstance(messages, list):
        raise_exception(
            "'messages' parameter must be a JSON-compatible list of chat messages",
            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
        )
    expected_roles = {"user", "assistant", "system"}
    image_found = False
    assistant_message_count = 0
    user_message_count = 0
    for num, message in enumerate(messages, 1):
        if not isinstance(message, dict):
            # Non-dict messages are expected to be azure.ai.inference models;
            # the import is deferred because that package is an optional dependency.
            try:
                from azure.ai.inference.models import (
                    ChatRequestMessage,
                    UserMessage,
                    AssistantMessage,
                    SystemMessage,
                    ImageContentItem,
                )
            except ImportError as ex:
                raise MissingRequiredPackage(
                    message="Please install 'azure-ai-inference' package to use SystemMessage, "
                    "UserMessage or AssistantMessage."
                ) from ex

            # Only the concrete strongly-typed subclasses are accepted.
            if isinstance(message, ChatRequestMessage) and not isinstance(
                message, (UserMessage, AssistantMessage, SystemMessage)
            ):
                raise_exception(
                    f"Messages must be a strongly typed class of ChatRequestMessage. Message number: {num}",
                    ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
                )
            if isinstance(message, AssistantMessage):
                assistant_message_count += 1
            if isinstance(message, UserMessage):
                user_message_count += 1
            if isinstance(message.content, list) and any(
                isinstance(item, ImageContentItem) for item in message.content
            ):
                image_found = True
            # Typed messages are fully handled above; skip the dict-shaped checks.
            continue
        if message.get("role") not in expected_roles:
            raise_exception(
                f"Invalid role provided: {message.get('role')}. Message number: {num}",
                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
            )
        if message.get("role") == "assistant":
            assistant_message_count += 1
        if message.get("role") == "user":
            user_message_count += 1
        content = message.get("content")
        if not isinstance(content, (str, list)):
            raise_exception(
                f"Content in each turn must be a string or array. Message number: {num}",
                ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
            )
        if isinstance(content, list):
            # An image part must carry a url inside its image_url payload to count.
            if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
                image_found = True
    if not image_found:
        raise_exception(
            "Message needs to have multi-modal input like images.",
            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
        )
    if assistant_message_count == 0:
        raise_exception(
            "Assistant role required in one of the messages.",
            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
        )
    if user_message_count == 0:
        raise_exception(
            "User role required in one of the messages.",
            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
        )
    if assistant_message_count > 1:
        raise_exception(
            "Evaluators for multimodal conversations only support single turn. "
            "User and assistant role expected as the only role in each message.",
            ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
        )
573
+
574
+
575
+ def _extract_text_from_content(content):
576
+ text = []
577
+ for msg in content:
578
+ if "text" in msg:
579
+ text.append(msg["text"])
580
+ return text
581
+
582
+
583
def filter_to_used_tools(tool_definitions, msgs_lists, logger=None):
    """Filters the tool definitions to only include those that were actually used in the messages lists.

    :param tool_definitions: List of tool definition dicts, each carrying a "name".
    :param msgs_lists: Iterable of message lists to scan for assistant tool calls.
    :param logger: Optional logger for warnings.
    :return: The filtered tool definitions, or the original list when filtering is
        impossible (tools were used but nothing matched, or parsing failed).
    """
    try:
        used_tool_names = set()
        any_tools_used = False
        for msgs in msgs_lists:
            for msg in msgs:
                if msg.get("role") == "assistant" and "content" in msg:
                    for content in msg.get("content", []):
                        if content.get("type") == "tool_call":
                            any_tools_used = True
                            if "tool_call" in content and "function" in content["tool_call"]:
                                # Bugfix: "function" is a dict carrying the tool name
                                # (see _get_agent_response); the whole dict was
                                # previously added to the set, which raised TypeError
                                # (unhashable) and always forced the fallback path.
                                function = content["tool_call"]["function"]
                                if isinstance(function, dict):
                                    used_tool_names.add(function.get("name"))
                                else:
                                    used_tool_names.add(function)
                            elif "name" in content:
                                used_tool_names.add(content["name"])

        filtered_tools = [tool for tool in tool_definitions if tool.get("name") in used_tool_names]
        if any_tools_used and not filtered_tools:
            # Tools were invoked but none matched a definition: filtering would drop
            # everything, so keep the original list instead.
            if logger:
                logger.warning("No tool definitions matched the tools used in the messages. Returning original list.")
            filtered_tools = tool_definitions

        return filtered_tools
    except Exception as e:
        if logger:
            logger.warning(f"Failed to filter tool definitions, returning original list. Error: {e}")
        return tool_definitions
610
+
611
+
612
def _get_conversation_history(query, include_system_messages=False, include_tool_messages=False):
    """Split a flat message list into alternating user queries and agent responses.

    Walks ``query`` in order, grouping consecutive user text into one "user query"
    and consecutive assistant/tool messages into one formatted "agent response"
    (rendered via _get_agent_response).

    :param query: Ordered list of message dicts, each with a "role" key.
    :param include_system_messages: If True, also capture the system message content.
    :param include_tool_messages: Forwarded to _get_agent_response for tool rendering.
    :return: Dict with "user_queries", "agent_responses" and optionally "system_message".
    :raises EvaluationException: If the history does not end with exactly one
        unanswered user query (user_queries must number agent_responses + 1).
    """
    all_user_queries, all_agent_responses = [], []
    cur_user_query, cur_agent_response = [], []
    system_message = None

    for msg in query:
        role = msg.get("role")
        if not role:
            # Messages without a role are silently ignored.
            continue
        if include_system_messages and role == "system":
            # NOTE(review): a later system message overwrites an earlier one — confirm intended.
            system_message = msg.get("content", "")

        elif role == "user" and "content" in msg:
            # A user message closes out any in-progress agent response group.
            if cur_agent_response:
                formatted_agent_response = _get_agent_response(
                    cur_agent_response, include_tool_messages=include_tool_messages
                )
                all_agent_responses.append([formatted_agent_response])
                cur_agent_response = []
            text_in_msg = _extract_text_from_content(msg["content"])
            if text_in_msg:
                cur_user_query.append(text_in_msg)

        elif role in ("assistant", "tool"):
            # An assistant/tool message closes out any in-progress user query group.
            if cur_user_query:
                all_user_queries.append(cur_user_query)
                cur_user_query = []
            cur_agent_response.append(msg)

    # Flush whichever group is still open at the end of the list.
    if cur_user_query:
        all_user_queries.append(cur_user_query)
    if cur_agent_response:
        formatted_agent_response = _get_agent_response(cur_agent_response, include_tool_messages=include_tool_messages)
        all_agent_responses.append([formatted_agent_response])

    # The history must end with the current (still unanswered) user query.
    if len(all_user_queries) != len(all_agent_responses) + 1:
        raise EvaluationException(
            message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
            internal_message=ErrorMessage.MALFORMED_CONVERSATION_HISTORY,
            target=ErrorTarget.CONVERSATION_HISTORY_PARSING,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.USER_ERROR,
        )

    result = {"user_queries": all_user_queries, "agent_responses": all_agent_responses}
    if include_system_messages and system_message:
        result["system_message"] = system_message
    return result
660
+
661
+
662
def _pretty_format_conversation_history(conversation_history):
    """Formats the conversation history for better readability."""
    # conversation_history carries "user_queries", "agent_responses" and
    # optionally "system_message" (see _get_conversation_history). user_queries
    # is one element longer than agent_responses, so zipping against
    # agent_responses + [None] pairs the final user turn with no agent turn.
    formatted_history = ""
    if conversation_history.get("system_message"):
        formatted_history += "SYSTEM_PROMPT:\n"
        formatted_history += " " + conversation_history["system_message"] + "\n\n"
    for i, (user_query, agent_response) in enumerate(
        zip(conversation_history["user_queries"], conversation_history["agent_responses"] + [None])
    ):
        formatted_history += f"User turn {i+1}:\n"
        for msg in user_query:
            # Entries may be nested lists of text parts; continuation lines are indented.
            if isinstance(msg, list):
                for submsg in msg:
                    formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
            else:
                formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
        formatted_history += "\n"
        if agent_response:
            formatted_history += f"Agent turn {i+1}:\n"
            for msg in agent_response:
                if isinstance(msg, list):
                    for submsg in msg:
                        formatted_history += " " + "\n ".join(submsg.split("\n")) + "\n"
                else:
                    formatted_history += " " + "\n ".join(msg.split("\n")) + "\n"
            formatted_history += "\n"
    return formatted_history
689
+
690
+
691
def reformat_conversation_history(query, logger=None, include_system_messages=False, include_tool_messages=False):
    """Reformats the conversation history to a more compact representation."""
    try:
        parsed_history = _get_conversation_history(
            query,
            include_system_messages=include_system_messages,
            include_tool_messages=include_tool_messages,
        )
        return _pretty_format_conversation_history(parsed_history)
    except Exception:
        # If the conversation history cannot be parsed for whatever reason (e.g. the converter format changed), the original query is returned
        # This is a fallback to ensure that the evaluation can still proceed. However the accuracy of the evaluation will be affected.
        # From our tests the negative impact on IntentResolution is:
        #   Higher intra model variance (0.142 vs 0.046)
        #   Higher inter model variance (0.345 vs 0.607)
        #   Lower percentage of mode in Likert scale (73.4% vs 75.4%)
        #   Lower pairwise agreement between LLMs (85% vs 90% at the pass/fail level with threshold of 3)
        if logger:
            logger.warning(f"Conversation history could not be parsed, falling back to original query: {query}")
        return query
711
+
712
+
713
def _get_agent_response(agent_response_msgs, include_tool_messages=False):
    """Extracts formatted agent response including text, and optionally tool calls/results.

    :param agent_response_msgs: Consecutive assistant/tool messages making up one agent turn.
    :param include_tool_messages: If True, render "[TOOL_CALL] name(args)" lines and pair
        each with its "[TOOL_RESULT] ..." line when a matching result was seen.
    :return: List of text lines for the agent turn.
    """
    agent_response_text = []
    tool_results = {}

    # First pass: collect tool results
    if include_tool_messages:
        for msg in agent_response_msgs:
            if msg.get("role") == "tool" and "tool_call_id" in msg:
                for content in msg.get("content", []):
                    if content.get("type") == "tool_result":
                        result = content.get("tool_result")
                        tool_results[msg["tool_call_id"]] = f"[TOOL_RESULT] {result}"

    # Second pass: parse assistant messages and tool calls
    for msg in agent_response_msgs:
        if "role" in msg and msg.get("role") == "assistant" and "content" in msg:
            text = _extract_text_from_content(msg["content"])
            if text:
                agent_response_text.extend(text)
            if include_tool_messages:
                for content in msg.get("content", []):
                    # Todo: Verify if this is the correct way to handle tool calls
                    if content.get("type") == "tool_call":
                        # Two content schemas are handled: a nested
                        # {"tool_call": {"function": {"name", "arguments"}, "id"}} shape
                        # and a flat {"name", "arguments", "tool_call_id"} shape.
                        if "tool_call" in content and "function" in content.get("tool_call", {}):
                            tc = content.get("tool_call", {})
                            func_name = tc.get("function", {}).get("name", "")
                            args = tc.get("function", {}).get("arguments", {})
                            tool_call_id = tc.get("id")
                        else:
                            tool_call_id = content.get("tool_call_id")
                            func_name = content.get("name", "")
                            args = content.get("arguments", {})
                        args_str = ", ".join(f'{k}="{v}"' for k, v in args.items())
                        call_line = f"[TOOL_CALL] {func_name}({args_str})"
                        agent_response_text.append(call_line)
                        # Append the matching tool result right after its call, if seen.
                        if tool_call_id in tool_results:
                            agent_response_text.append(tool_results[tool_call_id])

    return agent_response_text
753
+
754
+
755
def reformat_agent_response(response, logger=None, include_tool_messages=False):
    """Reformat an agent response message list into a compact newline-joined string.

    Returns "" for None/empty input; falls back to the original ``response`` when
    nothing can be extracted or parsing fails.

    :param response: The agent response messages (or None/empty list).
    :param logger: Optional logger for warnings.
    :param include_tool_messages: Whether tool calls/results are rendered too.
    """
    try:
        if response is None or response == []:
            return ""
        agent_response = _get_agent_response(response, include_tool_messages=include_tool_messages)
        if agent_response == []:
            # If no message could be extracted, likely the format changed, fallback to the original response in that case
            if logger:
                logger.warning(
                    f"Empty agent response extracted, likely due to input schema change. Falling back to using the original response: {response}"
                )
            return response
        return "\n".join(agent_response)
    except Exception:
        # Bugfix: previously a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        # If the agent response cannot be parsed for whatever reason (e.g. the converter format changed), the original response is returned
        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
        if logger:
            logger.warning(f"Agent response could not be parsed, falling back to original response: {response}")
        return response
774
+
775
+
776
def reformat_tool_definitions(tool_definitions, logger=None):
    """Render tool definitions as a compact multi-line text block."""
    try:
        def _render(tool):
            # One summary line per tool: name, trimmed description, parameter names.
            name = tool.get("name", "unnamed_tool")
            desc = tool.get("description", "").strip()
            params = tool.get("parameters", {}).get("properties", {})
            param_names = ", ".join(params.keys()) if params else "no parameters"
            return f"- {name}: {desc} (inputs: {param_names})"

        return "\n".join(["TOOL_DEFINITIONS:"] + [_render(tool) for tool in tool_definitions])
    except Exception:
        # If the tool definitions cannot be parsed for whatever reason, the original tool definitions are returned
        # This is a fallback to ensure that the evaluation can still proceed. See comments on reformat_conversation_history for more details.
        if logger:
            logger.warning(
                f"Tool definitions could not be parsed, falling back to original definitions: {tool_definitions}"
            )
        return tool_definitions
794
+
795
+
796
def simplify_messages(messages, drop_system=True, drop_tool_calls=False, logger=None):
    """
    Simplify a list of conversation messages by keeping only role and content.
    Optionally filter out system messages and/or tool calls.

    :param messages: List of message dicts (e.g., from query or response)
    :param drop_system: If True, remove system role messages
    :param drop_tool_calls: If True, remove tool_call items from assistant content
    :return: New simplified list of messages
    """
    if isinstance(messages, str):
        return messages
    try:
        # Anything that isn't a list is returned untouched.
        if not isinstance(messages, list):
            return messages

        result = []
        for msg in messages:
            # Non-dict entries pass through unchanged.
            if not isinstance(msg, dict):
                result.append(msg)
                continue

            role = msg.get("role")
            content = msg.get("content", [])

            # System messages are dropped entirely when requested.
            if role == "system" and drop_system:
                continue

            # User messages are reduced to their text parts.
            if role == "user":
                result.append({"role": role, "content": _extract_text_from_content(content)})
                continue

            # Tool result messages are dropped alongside tool calls.
            if role == "tool" and drop_tool_calls:
                continue

            # Assistant messages with any text are reduced to that text.
            if role == "assistant":
                text_parts = _extract_text_from_content(content)
                if text_parts:
                    result.append({"role": role, "content": text_parts})
                    continue

            # Any remaining message carrying tool_call parts is dropped on request.
            if drop_tool_calls and any(part.get("type") == "tool_call" for part in content if isinstance(part, dict)):
                continue

            # Everything else is kept as-is.
            result.append(msg)

        return result

    except Exception as ex:
        if logger:
            logger.debug(f"Error simplifying messages: {str(ex)}. Returning original messages.")
        return messages
862
+
863
+
864
def upload(path: str, container_client: ContainerClient, logger=None):
    """Upload files or directories to Azure Blob Storage using a container client.

    This function uploads a file or all files in a directory (recursively) to Azure Blob Storage.
    When uploading a directory, the relative path structure is preserved in the blob container.

    :param path: The local path to a file or directory to upload
    :type path: str
    :param container_client: The Azure Blob Container client to use for uploading
    :type container_client: azure.storage.blob.ContainerClient
    :param logger: Optional logger for debug output, defaults to None
    :type logger: logging.Logger, optional
    :raises EvaluationException: If the path doesn't exist or errors occur during upload
    """

    if not os.path.isdir(path) and not os.path.isfile(path):
        raise EvaluationException(
            message=f"Path '{path}' is not a directory or a file",
            internal_message=f"Path '{path}' is not a directory or a file",
            target=ErrorTarget.RAI_CLIENT,
            category=ErrorCategory.INVALID_VALUE,
            blame=ErrorBlame.SYSTEM_ERROR,
        )

    remote_paths = []
    local_paths = []

    if os.path.isdir(path):
        for root, _, filenames in os.walk(path):
            upload_path = ""
            if root != path:
                rel_path = os.path.relpath(root, path)
                # NOTE(review): posixpath.join with a single argument is a no-op; on
                # Windows rel_path may contain backslashes that pass straight through
                # into the blob name — confirm whether separators should be normalized.
                upload_path = posixpath.join(rel_path)
            for f in filenames:
                remote_file_path = posixpath.join(upload_path, f)
                remote_paths.append(remote_file_path)
                local_file_path = os.path.join(root, f)
                local_paths.append(local_file_path)

    if os.path.isfile(path):
        remote_paths = [os.path.basename(path)]
        local_paths = [path]

    try:
        # Open the file in binary read mode
        for local, remote in zip(local_paths, remote_paths):
            with open(local, "rb") as data:
                # Upload the file to Azure Blob Storage
                container_client.upload_blob(data=data, name=remote)
            if logger:
                logger.debug(f"File '{local}' uploaded successfully")

    except Exception as e:
        # Bugfix: chain the original error (raise ... from e) so the root cause is
        # preserved in the traceback instead of being discarded.
        raise EvaluationException(
            message=f"Error uploading file: {e}",
            internal_message=f"Error uploading file: {e}",
            target=ErrorTarget.RAI_CLIENT,
            category=ErrorCategory.UPLOAD_ERROR,
            blame=ErrorBlame.SYSTEM_ERROR,
        ) from e
+ )