azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation may be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
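Before the diff itself, a brief orientation: the list above adds prompty-based quality evaluators, agent-oriented evaluators, AOAI graders, a red_team module, and a reworked _common/rai_service.py backing the safety evaluators. The following is a minimal, illustrative sketch of how this 1.x surface is typically driven from the public API; the endpoint, deployment, data file, and project values are placeholders, not values taken from this diff.

# Illustrative sketch only; placeholder endpoint, deployment, data file, and project scope.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    GroundednessEvaluator,
    ViolenceEvaluator,
    evaluate,
)

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com/",  # placeholder
    azure_deployment="<your-deployment>",                        # placeholder
)

result = evaluate(
    data="eval_data.jsonl",  # rows with "query", "response", "context" columns (placeholder file)
    evaluators={
        # Prompty-based quality evaluator scored by your Azure OpenAI deployment.
        "groundedness": GroundednessEvaluator(model_config),
        # Safety evaluator backed by the Responsible AI service client shown in the diff below.
        "violence": ViolenceEvaluator(
            credential=DefaultAzureCredential(),
            azure_ai_project="<your-project-endpoint-or-scope>",  # placeholder
        ),
    },
)
print(result["metrics"])

evaluate() batches the JSONL rows through each configured evaluator and returns aggregate metrics plus per-row results; the safety evaluator takes the project scope and credential because it calls the Responsible AI service whose client code is patched below.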
@@ -3,61 +3,123 @@
  # ---------------------------------------------------------
  import asyncio
  import importlib.metadata
+ import math
  import re
  import time
+ import json
+ import html
  from ast import literal_eval
- from typing import Dict, List
+ from typing import Dict, List, Optional, Union, cast
  from urllib.parse import urlparse
+ from string import Template
+ from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
+ from azure.ai.evaluation._common.onedp.models import QueryResponseInlineMessage
+ from azure.core.exceptions import HttpResponseError

  import jwt
- import numpy as np

+ from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
- from azure.ai.evaluation._http_utils import get_async_http_client
+ from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
  from azure.ai.evaluation._model_configurations import AzureAIProject
+ from azure.ai.evaluation._user_agent import UserAgentSingleton
+ from azure.ai.evaluation._common.utils import is_onedp_project
  from azure.core.credentials import TokenCredential
- from azure.identity import DefaultAzureCredential
+ from azure.core.exceptions import HttpResponseError
+ from azure.core.pipeline.policies import AsyncRetryPolicy, UserAgentPolicy

  from .constants import (
  CommonConstants,
  EvaluationMetrics,
  RAIService,
  Tasks,
- _InternalAnnotationTasks,
  _InternalEvaluationMetrics,
  )
- from .utils import get_harm_severity_level
+ from .utils import get_harm_severity_level, retrieve_content_type

- try:
- version = importlib.metadata.version("azure-ai-evaluation")
- except importlib.metadata.PackageNotFoundError:
- version = "unknown"
- USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)

+ USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
+ "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
+ }
+ ML_WORKSPACE = "https://management.azure.com/.default"
+ COG_SRV_WORKSPACE = "https://ai.azure.com/.default"

- def get_common_headers(token: str) -> Dict:
+ INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"
+
+
+ def get_formatted_template(data: dict, annotation_task: str) -> str:
+ """Given the task and input data, produce a formatted string that will serve as the main
+ payload for the RAI service. Requires specific per-task logic.
+
+ :param data: The data to incorporate into the payload.
+ :type data: dict
+ :param annotation_task: The annotation task to use. This determines the template to use.
+ :type annotation_task: str
+ :return: The formatted based on the data and task template.
+ :rtype: str
+ """
+ # Template class doesn't play nice with json dumping/loading, just handle groundedness'
+ # JSON format manually.
+ # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'),
+ if annotation_task == Tasks.GROUNDEDNESS:
+ as_dict = {
+ "question": data.get("query", ""),
+ "answer": data.get("response", ""),
+ "context": data.get("context", ""),
+ }
+ return json.dumps(as_dict)
+ if annotation_task == Tasks.CODE_VULNERABILITY:
+ as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
+ return json.dumps(as_dict)
+ if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
+ as_dict = {
+ "query": data.get("query", ""),
+ "response": data.get("response", ""),
+ "context": data.get("context", ""),
+ }
+ return json.dumps(as_dict)
+ as_dict = {
+ "query": html.escape(data.get("query", "")),
+ "response": html.escape(data.get("response", "")),
+ }
+ user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict)
+ return user_text.replace("'", '\\"')
+
+
+ def get_common_headers(token: str, evaluator_name: Optional[str] = None) -> Dict:
  """Get common headers for the HTTP request

  :param token: The Azure authentication token.
  :type token: str
+ :param evaluator_name: The evaluator name. Default is None.
+ :type evaluator_name: str
  :return: The common headers.
  :rtype: Dict
  """
+ user_agent = (
+ f"{UserAgentSingleton().value} (type=evaluator; subtype={evaluator_name})"
+ if evaluator_name
+ else UserAgentSingleton().value
+ )
  return {
  "Authorization": f"Bearer {token}",
- "Content-Type": "application/json",
- "User-Agent": USER_AGENT,
- # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
- # https://github.com/encode/httpx/discussions/2959
- "Connection": "close",
+ "User-Agent": user_agent,
  }


- async def ensure_service_availability(rai_svc_url: str, token: str, capability: str = None) -> None:
+ def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
+ return get_async_http_client().with_policies(
+ retry_policy=AsyncRetryPolicy(timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT)
+ )
+
+
+ async def ensure_service_availability_onedp(
+ client: AIProjectClient, token: str, capability: Optional[str] = None
+ ) -> None:
  """Check if the Responsible AI service is available in the region and has the required capability, if relevant.

- :param rai_svc_url: The Responsible AI service URL.
- :type rai_svc_url: str
+ :param client: The AI project client.
+ :type client: AIProjectClient
  :param token: The Azure authentication token.
  :type token: str
  :param capability: The capability to check. Default is None.
@@ -65,37 +127,65 @@ async def ensure_service_availability(rai_svc_url: str, token: str, capability:
  :raises Exception: If the service is not available in the region or the capability is not available.
  """
  headers = get_common_headers(token)
- svc_liveness_url = rai_svc_url + "/checkannotation"
-
- async with get_async_http_client() as client:
- response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
- svc_liveness_url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
- )
-
- if response.status_code != 200:
- msg = f"RAI service is not available in this region. Status Code: {response.status_code}"
- raise EvaluationException(
- message=msg,
- internal_message=msg,
- target=ErrorTarget.UNKNOWN,
- category=ErrorCategory.SERVICE_UNAVAILABLE,
- blame=ErrorBlame.USER_ERROR,
- )
-
- capabilities = response.json()
+ capabilities = client.evaluations.check_annotation(headers=headers)

  if capability and capability not in capabilities:
- msg = f"Capability '{capability}' is not available in this region"
+ msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
  raise EvaluationException(
  message=msg,
  internal_message=msg,
  target=ErrorTarget.RAI_CLIENT,
  category=ErrorCategory.SERVICE_UNAVAILABLE,
  blame=ErrorBlame.USER_ERROR,
+ tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
  )


- def generate_payload(normalized_user_text: str, metric: str) -> Dict:
+ async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
+ """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
+
+ :param rai_svc_url: The Responsible AI service URL.
+ :type rai_svc_url: str
+ :param token: The Azure authentication token.
+ :type token: str
+ :param capability: The capability to check. Default is None.
+ :type capability: str
+ :raises Exception: If the service is not available in the region or the capability is not available.
+ """
+ headers = get_common_headers(token)
+ svc_liveness_url = rai_svc_url + "/checkannotation"
+
+ async with get_async_http_client() as client:
+ response = await client.get(svc_liveness_url, headers=headers)
+
+ if response.status_code != 200:
+ msg = (
+ f"RAI service is unavailable in this region, or you lack the necessary permissions "
+ f"to access the AI project. Status Code: {response.status_code}"
+ )
+ raise EvaluationException(
+ message=msg,
+ internal_message=msg,
+ target=ErrorTarget.RAI_CLIENT,
+ category=ErrorCategory.SERVICE_UNAVAILABLE,
+ blame=ErrorBlame.USER_ERROR,
+ tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+ )
+
+ capabilities = response.json()
+ if capability and capability not in capabilities:
+ msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+ raise EvaluationException(
+ message=msg,
+ internal_message=msg,
+ target=ErrorTarget.RAI_CLIENT,
+ category=ErrorCategory.SERVICE_UNAVAILABLE,
+ blame=ErrorBlame.USER_ERROR,
+ tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+ )
+
+
+ def generate_payload(normalized_user_text: str, metric: str, annotation_task: str) -> Dict:
  """Generate the payload for the annotation request

  :param normalized_user_text: The normalized user text to be entered as the "UserTextList" in the payload.
@@ -103,19 +193,20 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
  :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
  in the payload.
  :type metric: str
+ :param annotation_task: The annotation task to be passed to service
+ :type annotation_task: str
  :return: The payload for the annotation request.
  :rtype: Dict
  """
  include_metric = True
- task = Tasks.CONTENT_HARM
+ task = annotation_task
  if metric == EvaluationMetrics.PROTECTED_MATERIAL:
- task = Tasks.PROTECTED_MATERIAL
+ include_metric = False
+ elif metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
  include_metric = False
  elif metric == _InternalEvaluationMetrics.ECI:
- task = _InternalAnnotationTasks.ECI
  include_metric = False
  elif metric == EvaluationMetrics.XPIA:
- task = Tasks.XPIA
  include_metric = False
  return (
  {
@@ -131,39 +222,78 @@ def generate_payload(normalized_user_text: str, metric: str) -> Dict:
  )


- async def submit_request(query: str, response: str, metric: str, rai_svc_url: str, token: str) -> str:
+ async def submit_request(
+ data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str, evaluator_name: str
+ ) -> str:
  """Submit request to Responsible AI service for evaluation and return operation ID

- :param query: The query to evaluate.
- :type query: str
- :param response: The response to evaluate.
- :type response: str
+ :param data: The data to evaluate.
+ :type data: dict
  :param metric: The evaluation metric to use.
  :type metric: str
  :param rai_svc_url: The Responsible AI service URL.
  :type rai_svc_url: str
  :param token: The Azure authentication token.
  :type token: str
+ :param annotation_task: The annotation task to use.
+ :type annotation_task: str
+ :param evaluator_name: The evaluator name.
+ :type evaluator_name: str
  :return: The operation ID.
  :rtype: str
  """
- user_text = f"<Human>{query}</><System>{response}</>"
- normalized_user_text = user_text.replace("'", '\\"')
- payload = generate_payload(normalized_user_text, metric)
+ normalized_user_text = get_formatted_template(data, annotation_task)
+ payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)

  url = rai_svc_url + "/submitannotation"
- headers = get_common_headers(token)
+ headers = get_common_headers(token, evaluator_name)

- async with get_async_http_client() as client:
- response = await client.post( # pylint: disable=too-many-function-args,unexpected-keyword-arg
- url, json=payload, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
- )
+ async with get_async_http_client_with_timeout() as client:
+ http_response = await client.post(url, json=payload, headers=headers)

- if response.status_code != 202:
- print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], response.text))
- response.raise_for_status()
+ if http_response.status_code != 202:
+ print("Fail evaluating '%s' with error message: %s" % (payload["UserTextList"], http_response.text()))
+ http_response.raise_for_status()
+ result = http_response.json()
+ operation_id = result["location"].split("/")[-1]
+ return operation_id

- result = response.json()
+
+ async def submit_request_onedp(
+ client: AIProjectClient,
+ data: dict,
+ metric: str,
+ token: str,
+ annotation_task: str,
+ evaluator_name: str,
+ scan_session_id: Optional[str] = None,
+ ) -> str:
+ """Submit request to Responsible AI service for evaluation and return operation ID
+
+ :param client: The AI project client.
+ :type client: AIProjectClient
+ :param data: The data to evaluate.
+ :type data: dict
+ :param metric: The evaluation metric to use.
+ :type metric: str
+ :param token: The Azure authentication token.
+ :type token: str
+ :param annotation_task: The annotation task to use.
+ :type annotation_task: str
+ :param evaluator_name: The evaluator name.
+ :type evaluator_name: str
+ :param scan_session_id: The scan session ID to use for the evaluation.
+ :type scan_session_id: Optional[str]
+ :return: The operation ID.
+ :rtype: str
+ """
+ normalized_user_text = get_formatted_template(data, annotation_task)
+ payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
+ headers = get_common_headers(token, evaluator_name)
+ if scan_session_id:
+ headers["x-ms-client-request-id"] = scan_session_id
+ response = client.evaluations.submit_annotation(payload, headers=headers)
+ result = json.loads(response)
  operation_id = result["location"].split("/")[-1]
  return operation_id

@@ -191,9 +321,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
  headers = get_common_headers(token)

  async with get_async_http_client() as client:
- response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
- url, headers=headers, timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT
- )
+ response = await client.get(url, headers=headers, timeout=RAIService.TIMEOUT)

  if response.status_code == 200:
  return response.json()
@@ -207,65 +335,195 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
  await asyncio.sleep(sleep_time)


+ async def fetch_result_onedp(client: AIProjectClient, operation_id: str, token: str) -> Dict:
+ """Fetch the annotation result from Responsible AI service
+
+ :param client: The AI project client.
+ :type client: AIProjectClient
+ :param operation_id: The operation ID.
+ :type operation_id: str
+ :param token: The Azure authentication token.
+ :type token: str
+ :return: The annotation result.
+ :rtype: Dict
+ """
+ start = time.time()
+ request_count = 0
+
+ while True:
+ headers = get_common_headers(token)
+ try:
+ return client.evaluations.operation_results(operation_id, headers=headers)
+ except HttpResponseError:
+ request_count += 1
+ time_elapsed = time.time() - start
+ if time_elapsed > RAIService.TIMEOUT:
+ raise TimeoutError(
+ f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds"
+ )
+
+ sleep_time = RAIService.SLEEP_TIME**request_count
+ await asyncio.sleep(sleep_time)
+
+
  def parse_response( # pylint: disable=too-many-branches,too-many-statements
- batch_response: List[Dict], metric_name: str
- ) -> Dict:
+ batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+ ) -> Dict[str, Union[str, float]]:
  """Parse the annotation response from Responsible AI service for a content harm evaluation.

  :param batch_response: The annotation response from Responsible AI service.
  :type batch_response: List[Dict]
  :param metric_name: The evaluation metric to use.
  :type metric_name: str
+ :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+ :type metric_display_name: Optional[str]
  :return: The parsed annotation result.
- :rtype: List[List[Dict]]
+ :rtype: Dict[str, Union[str, float]]
  """
+ if metric_display_name is None:
+ metric_display_name = metric_name
+
  # non-numeric metrics
- if metric_name in {EvaluationMetrics.PROTECTED_MATERIAL, _InternalEvaluationMetrics.ECI, EvaluationMetrics.XPIA}:
- if not batch_response or len(batch_response[0]) == 0 or metric_name not in batch_response[0]:
+ if metric_name in {
+ EvaluationMetrics.PROTECTED_MATERIAL,
+ _InternalEvaluationMetrics.ECI,
+ EvaluationMetrics.XPIA,
+ EvaluationMetrics.CODE_VULNERABILITY,
+ EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
+ }:
+ result = {}
+ if not batch_response or len(batch_response[0]) == 0:
+ return {}
+ if (
+ metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+ and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]
+ ):
+ batch_response[0] = {
+ EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
+ }
+ if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
+ pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
+ for pm_metric_name in pm_metric_names:
+ response = batch_response[0][pm_metric_name]
+ response = response.replace("false", "False")
+ response = response.replace("true", "True")
+ parsed_response = literal_eval(response)
+ result[pm_metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+ result[pm_metric_name + "_reason"] = (
+ parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+ )
+ result[pm_metric_name + "_total_tokens"] = (
+ parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+ )
+ result[pm_metric_name + "_prompt_tokens"] = (
+ parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+ )
+ result[pm_metric_name + "_completion_tokens"] = (
+ parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+ )
+ result[pm_metric_name + "_finish_reason"] = (
+ parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+ )
+ result[pm_metric_name + "_sample_input"] = (
+ parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+ )
+ result[pm_metric_name + "_sample_output"] = (
+ parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+ )
+ result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
+ return result
+ if metric_name not in batch_response[0]:
  return {}
  response = batch_response[0][metric_name]
  response = response.replace("false", "False")
  response = response.replace("true", "True")
  parsed_response = literal_eval(response)
- result = {}
  # Use label instead of score since these are assumed to be boolean results.
- # Use np.nan as null value since it's ignored by aggregations rather than treated as 0.
- result[metric_name + "_label"] = parsed_response["label"] if "label" in parsed_response else np.nan
- result[metric_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""
+ # Use math.nan as null value since it's ignored by aggregations rather than treated as 0.
+ result[metric_display_name + "_label"] = parsed_response["label"] if "label" in parsed_response else math.nan
+ result[metric_display_name + "_reason"] = parsed_response["reasoning"] if "reasoning" in parsed_response else ""

  if metric_name == EvaluationMetrics.XPIA:
  # Add "manipulated_content", "intrusion" and "information_gathering" to the result
- # if present else set them to np.nan
- result[metric_name + "_manipulated_content"] = (
- parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else np.nan
+ # if present else set them to math.nan
+ result[metric_display_name + "_manipulated_content"] = (
+ parsed_response["manipulated_content"] if "manipulated_content" in parsed_response else math.nan
  )
- result[metric_name + "_intrusion"] = (
- parsed_response["intrusion"] if "intrusion" in parsed_response else np.nan
+ result[metric_display_name + "_intrusion"] = (
+ parsed_response["intrusion"] if "intrusion" in parsed_response else math.nan
  )
- result[metric_name + "_information_gathering"] = (
- parsed_response["information_gathering"] if "information_gathering" in parsed_response else np.nan
+ result[metric_display_name + "_information_gathering"] = (
+ parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
  )
+ if (
+ metric_name == EvaluationMetrics.CODE_VULNERABILITY
+ or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+ ):
+ # Add all attributes under the details.
+ details = {}
+ for key, value in parsed_response.items():
+ if key not in {
+ "label",
+ "reasoning",
+ "version",
+ "totalTokenCount",
+ "inputTokenCount",
+ "outputTokenCount",
+ "finish_reason",
+ "sample_input",
+ "sample_output",
+ "model",
+ }:
+ details[key.replace("-", "_")] = value
+ result[metric_display_name + "_details"] = details
+ result[metric_display_name + "_total_tokens"] = (
+ parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+ )
+ result[metric_display_name + "_prompt_tokens"] = (
+ parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+ )
+ result[metric_display_name + "_completion_tokens"] = (
+ parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+ )
+ result[metric_display_name + "_finish_reason"] = (
+ parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+ )
+ result[metric_display_name + "_sample_input"] = (
+ parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+ )
+ result[metric_display_name + "_sample_output"] = (
+ parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+ )
+ result[metric_display_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
  return result
- return _parse_content_harm_response(batch_response, metric_name)
+ return _parse_content_harm_response(batch_response, metric_name, metric_display_name)


- def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -> Dict:
+ def _parse_content_harm_response(
+ batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
+ ) -> Dict[str, Union[str, float]]:
  """Parse the annotation response from Responsible AI service for a content harm evaluation.

  :param batch_response: The annotation response from Responsible AI service.
  :type batch_response: List[Dict]
  :param metric_name: The evaluation metric to use.
  :type metric_name: str
+ :param metric_display_name: The evaluation metric display name to use. If unset, use the metric_name.
+ :type metric_display_name: Optional[str]
  :return: The parsed annotation result.
- :rtype: List[List[Dict]]
+ :rtype: Dict[str, Union[str, float]]
  """
  # Fix the metric name if it's "hate_fairness"
  # Eventually we will remove this fix once the RAI service is updated
- key = metric_name
+ key = metric_name if metric_display_name is None else metric_display_name
  if key == EvaluationMetrics.HATE_FAIRNESS:
  key = EvaluationMetrics.HATE_UNFAIRNESS

- result = {key: np.nan, key + "_score": np.nan, key + "_reason": ""}
+ result: Dict[str, Union[str, float]] = {
+ (key.value if hasattr(key, "value") else key): math.nan,
+ f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan,
+ f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan,
+ }

  response = batch_response[0]
  if metric_name not in response:
@@ -276,6 +534,13 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
  except Exception: # pylint: disable=broad-exception-caught
  harm_response = response[metric_name]

+ total_tokens = 0
+ prompt_tokens = 0
+ completion_tokens = 0
+ finish_reason = ""
+ sample_input = ""
+ sample_output = ""
+ model = ""
  if harm_response != "" and isinstance(harm_response, dict):
  # check if "output" is one key in harm_response
  if "output" in harm_response:
@@ -283,11 +548,18 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -

  # get content harm metric_value
  if "label" in harm_response:
- metric_value = harm_response["label"]
+ try:
+ # Handle "n/a" or other non-numeric values
+ if isinstance(harm_response["label"], str) and harm_response["label"].strip().lower() == "n/a":
+ metric_value = math.nan
+ else:
+ metric_value = float(harm_response["label"])
+ except (ValueError, TypeError):
+ metric_value = math.nan
  elif "valid" in harm_response:
- metric_value = 0 if harm_response["valid"] else np.nan
+ metric_value = 0 if harm_response["valid"] else math.nan
  else:
- metric_value = np.nan
+ metric_value = math.nan

  # get reason
  if "reasoning" in harm_response:
@@ -296,31 +568,77 @@ def _parse_content_harm_response(batch_response: List[Dict], metric_name: str) -
  reason = harm_response["reason"]
  else:
  reason = ""
+
+ # get token_usage
+ if "totalTokenCount" in harm_response:
+ total_tokens = harm_response["totalTokenCount"]
+ else:
+ total_tokens = 0
+ if "inputTokenCount" in harm_response:
+ prompt_tokens = harm_response["inputTokenCount"]
+ else:
+ prompt_tokens = 0
+ if "outputTokenCount" in harm_response:
+ completion_tokens = harm_response["outputTokenCount"]
+ else:
+ completion_tokens = 0
+
+ # get finish_reason
+ if "finish_reason" in harm_response:
+ finish_reason = harm_response["finish_reason"]
+ else:
+ finish_reason = ""
+
+ # get sample_input
+ if "sample_input" in harm_response:
+ sample_input = harm_response["sample_input"]
+ else:
+ sample_input = ""
+
+ # get sample_output
+ if "sample_output" in harm_response:
+ sample_output = harm_response["sample_output"]
+ else:
+ sample_output = ""
+
+ # get model
+ if "model" in harm_response:
+ model = harm_response["model"]
+ else:
+ model = ""
  elif harm_response != "" and isinstance(harm_response, str):
  metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
  if metric_value_match:
  metric_value = int(metric_value_match[0])
  else:
- metric_value = np.nan
+ metric_value = math.nan
  reason = harm_response
  elif harm_response != "" and isinstance(harm_response, (int, float)):
  if 0 < harm_response <= 7:
  metric_value = harm_response
  else:
- metric_value = np.nan
+ metric_value = math.nan
  reason = ""
  else:
- metric_value = np.nan
+ metric_value = math.nan
  reason = ""

  harm_score = metric_value
- if not np.isnan(metric_value):
- # int(np.nan) causes a value error, and np.nan is already handled
+ # We've already handled the "n/a" case by converting to math.nan
+ if not math.isnan(metric_value):
+ # int(math.nan) causes a value error, and math.nan is already handled
  # by get_harm_severity_level
  harm_score = int(metric_value)
  result[key] = get_harm_severity_level(harm_score)
  result[key + "_score"] = harm_score
  result[key + "_reason"] = reason
+ result[key + "_total_tokens"] = total_tokens
+ result[key + "_prompt_tokens"] = prompt_tokens
+ result[key + "_completion_tokens"] = completion_tokens
+ result[key + "_finish_reason"] = finish_reason
+ result[key + "_sample_input"] = sample_input
+ result[key + "_sample_output"] = sample_output
+ result[key + "_model"] = model

  return result

@@ -337,31 +655,34 @@ async def _get_service_discovery_url(azure_ai_project: AzureAIProject, token: st
  """
  headers = get_common_headers(token)

- async with get_async_http_client() as client:
- response = await client.get( # pylint: disable=too-many-function-args,unexpected-keyword-arg
+ async with get_async_http_client_with_timeout() as client:
+ response = await client.get(
  f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/"
  f"resourceGroups/{azure_ai_project['resource_group_name']}/"
  f"providers/Microsoft.MachineLearningServices/workspaces/{azure_ai_project['project_name']}?"
  f"api-version=2023-08-01-preview",
  headers=headers,
- timeout=CommonConstants.DEFAULT_HTTP_TIMEOUT,
  )

  if response.status_code != 200:
- msg = "Failed to retrieve the discovery service URL."
+ msg = (
+ f"Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, "
+ f"and make sure you have the necessary access permissions. "
+ f"Status code: {response.status_code}."
+ )
  raise EvaluationException(
  message=msg,
- internal_message=msg,
  target=ErrorTarget.RAI_CLIENT,
- category=ErrorCategory.SERVICE_UNAVAILABLE,
- blame=ErrorBlame.UNKNOWN,
+ blame=ErrorBlame.USER_ERROR,
+ category=ErrorCategory.PROJECT_ACCESS_ERROR,
+ tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
  )

  base_url = urlparse(response.json()["properties"]["discoveryUrl"])
  return f"{base_url.scheme}://{base_url.netloc}"


- async def get_rai_svc_url(project_scope: dict, token: str) -> str:
+ async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
  """Get the Responsible AI service URL

  :param project_scope: The Azure AI project scope details.
@@ -385,7 +706,9 @@ async def get_rai_svc_url(project_scope: dict, token: str) -> str:
  return rai_url


- async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str:
+ async def fetch_or_reuse_token(
+ credential: TokenCredential, token: Optional[str] = None, workspace: Optional[str] = ML_WORKSPACE
+ ) -> str:
  """Get token. Fetch a new token if the current token is near expiry

  :param credential: The Azure authentication credential.
@@ -395,58 +718,424 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -
395
718
  :type token: str
396
719
  :return: The Azure authentication token.
397
720
  """
398
- acquire_new_token = True
399
- try:
400
- if token:
401
- # Decode the token to get its expiration time
721
+ if token:
722
+ # Decode the token to get its expiration time
723
+ try:
402
724
  decoded_token = jwt.decode(token, options={"verify_signature": False})
725
+ except jwt.PyJWTError:
726
+ pass
727
+ else:
403
728
  exp_time = decoded_token["exp"]
404
729
  current_time = time.time()
405
730
 
406
- # Check if the token is near expiry
731
+ # Return current token if not near expiry
407
732
  if (exp_time - current_time) >= 300:
408
- acquire_new_token = False
409
- except Exception: # pylint: disable=broad-exception-caught
410
- pass
733
+ return token
411
734
 
412
- if acquire_new_token:
413
- token = credential.get_token("https://management.azure.com/.default").token
414
-
415
- return token
735
+ return credential.get_token(workspace).token
416
736
 
417
737
 
418
738
  async def evaluate_with_rai_service(
419
- query: str, response: str, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
739
+ data: dict,
740
+ metric_name: str,
741
+ project_scope: Union[str, AzureAIProject],
742
+ credential: TokenCredential,
743
+ annotation_task: str = Tasks.CONTENT_HARM,
744
+ metric_display_name=None,
745
+ evaluator_name=None,
746
+ scan_session_id: Optional[str] = None,
747
+ ) -> Dict[str, Union[str, float]]:
748
+ """Evaluate the content safety of the response using Responsible AI service
749
+
750
+ :param data: The data to evaluate.
751
+ :type data: dict
752
+ :param metric_name: The evaluation metric to use.
753
+ :type metric_name: str
754
+ :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
755
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
756
+ :type project_scope: Union[str, AzureAIProject]
757
+ :param credential: The Azure authentication credential.
758
+ :type credential: ~azure.core.credentials.TokenCredential
759
+ :param annotation_task: The annotation task to use.
760
+ :type annotation_task: str
761
+ :param metric_display_name: The display name of metric to use.
762
+ :type metric_display_name: str
763
+ :param evaluator_name: The evaluator name to use.
764
+ :type evaluator_name: str
765
+ :param scan_session_id: The scan session ID to use for the evaluation.
766
+ :type scan_session_id: Optional[str]
767
+ :return: The parsed annotation result.
768
+ :rtype: Dict[str, Union[str, float]]
769
+ """
770
+
771
+ if is_onedp_project(project_scope):
772
+ client = AIProjectClient(
773
+ endpoint=project_scope,
774
+ credential=credential,
775
+ user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
776
+ )
777
+ token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
778
+ await ensure_service_availability_onedp(client, token, annotation_task)
779
+ operation_id = await submit_request_onedp(
780
+ client, data, metric_name, token, annotation_task, evaluator_name, scan_session_id
781
+ )
782
+ annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
783
+ result = parse_response(annotation_response, metric_name, metric_display_name)
784
+ return result
785
+ else:
786
+ # Get RAI service URL from discovery service and check service availability
787
+ token = await fetch_or_reuse_token(credential)
788
+ rai_svc_url = await get_rai_svc_url(project_scope, token)
789
+ await ensure_service_availability(rai_svc_url, token, annotation_task)
790
+
791
+ # Submit annotation request and fetch result
792
+ operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task, evaluator_name)
793
+ annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
794
+ result = parse_response(annotation_response, metric_name, metric_display_name)
795
+
796
+ return result
797
+
798
+
799
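The reworked evaluate_with_rai_service takes its inputs as a single data dict and routes the request either through the OneDP AIProjectClient (when project_scope is an endpoint string) or through the legacy discovery-based RAI service. A hypothetical usage sketch; the project details, metric name, and module path are placeholders or assumptions, not values taken from this diff:

    import asyncio
    from azure.identity import DefaultAzureCredential
    # Module path assumed for this private helper.
    from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service

    async def main():
        project = {
            "subscription_id": "<subscription-id>",
            "resource_group_name": "<resource-group>",
            "project_name": "<project-name>",
        }
        result = await evaluate_with_rai_service(
            data={"query": "What is the capital of France?", "response": "Paris."},
            metric_name="violence",  # illustrative metric name
            project_scope=project,
            credential=DefaultAzureCredential(),
        )
        print(result)  # e.g. {"violence": ..., "violence_score": ..., "violence_reason": ...}

    asyncio.run(main())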
+ def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
+ """Generate the payload for the annotation request
+ :param content_type: The type of the content representing multimodal or images.
+ :type content_type: str
+ :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+ :type messages: str
+ :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed
+ in the payload.
+ :type metric: str
+ :return: The payload for the annotation request.
+ :rtype: Dict
+ """
+ include_metric = True
+ task = Tasks.CONTENT_HARM
+ if metric == EvaluationMetrics.PROTECTED_MATERIAL:
+ task = Tasks.PROTECTED_MATERIAL
+ include_metric = False
+
+ if include_metric:
+ return {
+ "ContentType": content_type,
+ "Contents": [{"messages": messages}],
+ "AnnotationTask": task,
+ "MetricList": [metric],
+ }
+ return {
+ "ContentType": content_type,
+ "Contents": [{"messages": messages}],
+ "AnnotationTask": task,
+ }
+
+
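For most metrics the generated payload carries a MetricList; for protected material the annotation task alone identifies the work, so the metric list is dropped. A sketch of the shape returned for a content-harm metric (the content type, messages, and the literal task string are illustrative placeholders for the values supplied by callers and by the Tasks constants):

    payload = {
        "ContentType": "application/json",  # whatever the caller passed in
        "Contents": [{"messages": [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello!"},
        ]}],
        "AnnotationTask": "content harm",   # placeholder for Tasks.CONTENT_HARM
        "MetricList": ["violence"],         # omitted when the metric is protected material
    }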
+ async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str:
+ """Submit request to Responsible AI service for evaluation and return operation ID
+ :param messages: The normalized list of messages to be entered as the "Contents" in the payload.
+ :type messages: str
+ :param metric: The evaluation metric to use.
+ :type metric: str
+ :param rai_svc_url: The Responsible AI service URL.
+ :type rai_svc_url: str
+ :param token: The Azure authentication token.
+ :type token: str
+ :return: The operation ID.
+ :rtype: str
+ """
+ ## handle json payload and payload from inference sdk strongly type messages
+ if len(messages) > 0 and not isinstance(messages[0], dict):
+ try:
+ from azure.ai.inference.models import ChatRequestMessage
+ except ImportError as ex:
+ error_message = (
+ "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+ )
+ raise MissingRequiredPackage(message=error_message) from ex
+ if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+ messages = [message.as_dict() for message in messages]
+
+ filtered_messages = [message for message in messages if message["role"] != "system"]
+ assistant_messages = [message for message in messages if message["role"] == "assistant"]
+ content_type = retrieve_content_type(assistant_messages, metric)
+ payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+
+ ## calling rai service for annotation
+ url = rai_svc_url + "/submitannotation"
+ headers = get_common_headers(token)
+ async with get_async_http_client() as client:
+ response = await client.post( # pylint: disable=too-many-function-args,unexpected-keyword-arg
+ url, json=payload, headers=headers
+ )
+ if response.status_code != 202:
+ raise HttpResponseError(
+ message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response
+ )
+ result = response.json()
+ operation_id = result["location"].split("/")[-1]
+ return operation_id
+
+
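submit_multimodal_request expects an HTTP 202 and then derives the operation ID from the last path segment of the returned location, which fetch_result later polls. A tiny illustration of that extraction (the URL is made up):

    location = "https://<rai-service>/operations/123e4567-e89b-12d3-a456-426614174000"
    operation_id = location.split("/")[-1]
    print(operation_id)  # 123e4567-e89b-12d3-a456-426614174000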
+ async def submit_multimodal_request_onedp(client: AIProjectClient, messages, metric: str, token: str) -> str:
+
+ # handle inference sdk strongly type messages
+ if len(messages) > 0 and not isinstance(messages[0], dict):
+ try:
+ from azure.ai.inference.models import ChatRequestMessage
+ except ImportError as ex:
+ error_message = (
+ "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+ )
+ raise MissingRequiredPackage(message=error_message) from ex
+ if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+ messages = [message.as_dict() for message in messages]
+
+ ## fetch system and assistant messages from the list of messages
+ filtered_messages = [message for message in messages if message["role"] != "system"]
+ assistant_messages = [message for message in messages if message["role"] == "assistant"]
+
+ ## prepare for request
+ content_type = retrieve_content_type(assistant_messages, metric)
+ payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+ headers = get_common_headers(token)
+
+ response = client.evaluations.submit_annotation(payload, headers=headers)
+
+ result = json.loads(response)
+ operation_id = result["location"].split("/")[-1]
+ return operation_id
+
+
+ def _build_sync_eval_payload(
+ data: dict, metric_name: str, annotation_task: str, scan_session_id: Optional[str] = None
+ ) -> Dict:
+ """Build the sync_evals payload for evaluation using QueryResponseInlineMessage format.
+
+ :param data: The data to evaluate, containing 'query', 'response', and optionally 'context' and 'tool_calls'.
+ :type data: dict
+ :param metric_name: The evaluation metric to use.
+ :type metric_name: str
+ :param annotation_task: The annotation task to use.
+ :type annotation_task: str
+ :param scan_session_id: The scan session ID to use for the evaluation.
+ :type scan_session_id: Optional[str]
+ :return: The sync_eval payload ready to send to the API.
+ :rtype: Dict
+ """
+
+ # Build properties/metadata (scenario, category, taxonomy, etc.)
+ properties = {}
+ if data.get("scenario") is not None:
+ properties["scenario"] = data["scenario"]
+ if data.get("risk_sub_type") is not None:
+ properties["category"] = data["risk_sub_type"]
+ if data.get("taxonomy") is not None:
+ properties["taxonomy"] = str(data["taxonomy"]) # Ensure taxonomy is converted to string
+
+ # Prepare context if available
+ context = None
+ if data.get("context") is not None:
+ context = " ".join(c["content"] for c in data["context"]["contexts"])
+
+ # Build QueryResponseInlineMessage object
+ item_content = QueryResponseInlineMessage(
+ query=data.get("query", ""),
+ response=data.get("response", ""),
+ context=context,
+ tools=data.get("tool_calls"),
+ properties=properties if properties else None,
+ )
+
+ # Build the data mapping using mustache syntax {{item.field}}
+ data_mapping = {
+ "query": "{{item.query}}",
+ "response": "{{item.response}}",
+ }
+
+ # Create the sync eval input payload
+ # Structure: Uses QueryResponseInlineMessage format with azure_ai_evaluator type
+ sync_eval_payload = {
+ "name": f"Safety Eval - {metric_name}",
+ "data_source": {
+ "type": "jsonl",
+ "source": {"type": "file_content", "content": {"item": item_content}},
+ },
+ "testing_criteria": [
+ {
+ "type": "azure_ai_evaluator",
+ "name": metric_name,
+ "evaluator_name": metric_name,
+ "data_mapping": data_mapping,
+ }
+ ],
+ }
+
+ return sync_eval_payload
+
+
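A sketch of what _build_sync_eval_payload produces for a plain query/response pair and an illustrative metric name; in the real payload the item is a QueryResponseInlineMessage instance rather than the plain dict shown here:

    sync_eval_payload = {
        "name": "Safety Eval - violence",
        "data_source": {
            "type": "jsonl",
            "source": {
                "type": "file_content",
                "content": {"item": {"query": "Hi", "response": "Hello!", "context": None}},
            },
        },
        "testing_criteria": [
            {
                "type": "azure_ai_evaluator",
                "name": "violence",
                "evaluator_name": "violence",
                "data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"},
            }
        ],
    }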
+ def _parse_sync_eval_result(
+ eval_result, metric_name: str, metric_display_name: Optional[str] = None
+ ) -> Dict[str, Union[str, float]]:
+ """Parse the result from sync_evals response (EvalRunOutputItem) into the standard format.
+
+ :param eval_result: The result from sync_evals.create() call (EvalRunOutputItem).
+ :param metric_name: The evaluation metric name.
+ :type metric_name: str
+ :param metric_display_name: The display name for the metric.
+ :type metric_display_name: Optional[str]
+ :return: The parsed result in standard format compatible with parse_response.
+ :rtype: Dict[str, Union[str, float]]
+ """
+ # Handle EvalRunOutputItem structure
+ # Expected structure: {'results': [{'name': 'violence', 'score': 0.0, 'reason': '...', ...}]}
+
+ display_name = metric_display_name or metric_name
+
+ # Handle both dict and object formats
+ if hasattr(eval_result, "results"):
+ results = eval_result.results
+ elif isinstance(eval_result, dict) and "results" in eval_result:
+ results = eval_result["results"]
+ else:
+ return {}
+
+ if not results or len(results) == 0:
+ return {}
+
+ # Find the result for our specific metric
+ target_result = None
+ for result_item in results:
+ if isinstance(result_item, dict):
+ if result_item.get("name") == metric_name or result_item.get("metric") == metric_name:
+ target_result = result_item
+ break
+ elif hasattr(result_item, "name") and result_item.name == metric_name:
+ target_result = result_item
+ break
+
+ if not target_result:
+ return {}
+
+ # Extract values from the result item
+ if isinstance(target_result, dict):
+ score = target_result.get("score", math.nan)
+ reason = target_result.get("reason", "")
+ # Also check properties.reasoning for additional reason text
+ if not reason and "properties" in target_result:
+ props = target_result["properties"]
+ if isinstance(props, dict):
+ reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+ else:
+ score = getattr(target_result, "score", math.nan)
+ reason = getattr(target_result, "reason", "")
+ if not reason and hasattr(target_result, "properties"):
+ props = target_result.properties
+ if isinstance(props, dict):
+ reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+
+ # Convert score to severity level using existing logic
+ harm_score = score if not math.isnan(score) else math.nan
+ severity_level = get_harm_severity_level(harm_score) if not math.isnan(harm_score) else math.nan
+
+ # Return in the standard format expected by the red team processor
+ return {display_name: severity_level, f"{display_name}_score": harm_score, f"{display_name}_reason": reason}
+
+
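An illustration of the mapping _parse_sync_eval_result performs for a dict-shaped EvalRunOutputItem; the score, reason, and resulting severity label below are made up, and the label actually comes from get_harm_severity_level:

    eval_result = {
        "results": [
            {"name": "violence", "score": 2.0, "reason": "Mildly violent phrasing."}
        ]
    }

    # _parse_sync_eval_result(eval_result, "violence") would return roughly:
    # {
    #     "violence": <get_harm_severity_level(2.0)>,
    #     "violence_score": 2.0,
    #     "violence_reason": "Mildly violent phrasing.",
    # }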
+ async def evaluate_with_rai_service_sync(
+ data: dict,
+ metric_name: str,
+ project_scope: Union[str, AzureAIProject],
+ credential: TokenCredential,
+ annotation_task: str = Tasks.CONTENT_HARM,
+ metric_display_name=None,
+ evaluator_name=None,
+ scan_session_id: Optional[str] = None,
  ):
- """ "Evaluate the content safety of the response using Responsible AI service
+ """Evaluate the content safety of the response using sync_evals endpoint (OneDP only).

- :param query: The query to evaluate.
- :type query: str
- :param response: The response to evaluate.
- :type response: str
- :param metric_name: The evaluation metric to use.
- :type metric_name: str
- :param project_scope: The Azure AI project scope details.
- :type project_scope: Dict
- :param credential: The Azure authentication credential.
- :type credential:
- ~azure.core.credentials.TokenCredential
- :return: The parsed annotation result.
- :rtype: List[List[Dict]]
+ This function uses the sync_evals.create() API instead of the legacy evaluations.submit_annotation()
+ approach. It's specifically designed for OneDP projects and provides better integration with
+ the newer evaluation infrastructure. Returns the raw EvalRunOutputItem for direct use.
+
+ :param data: The data to evaluate.
+ :type data: dict
+ :param metric_name: The evaluation metric to use.
+ :type metric_name: str
+ :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type project_scope: Union[str, AzureAIProject]
+ :param credential: The Azure authentication credential.
+ :type credential: ~azure.core.credentials.TokenCredential
+ :param annotation_task: The annotation task to use.
+ :type annotation_task: str
+ :param metric_display_name: The display name of metric to use.
+ :type metric_display_name: str
+ :param evaluator_name: The evaluator name to use.
+ :type evaluator_name: str
+ :param scan_session_id: The scan session ID to use for the evaluation.
+ :type scan_session_id: Optional[str]
+ :return: The EvalRunOutputItem containing the evaluation results.
+ :rtype: EvalRunOutputItem
+ :raises: EvaluationException if project_scope is not a OneDP project
  """
- # Use DefaultAzureCredential if no credential is provided
- # This is for the for batch run scenario as the credential cannot be serialized by promoptflow
- if credential is None or credential == {}:
- credential = DefaultAzureCredential()
+ if not is_onedp_project(project_scope):
+ msg = "evaluate_with_rai_service_sync only supports OneDP projects. Use evaluate_with_rai_service for legacy projects."
+ raise EvaluationException(
+ message=msg,
+ internal_message=msg,
+ target=ErrorTarget.RAI_CLIENT,
+ category=ErrorCategory.INVALID_VALUE,
+ blame=ErrorBlame.USER_ERROR,
+ )
+
+ client = AIProjectClient(
+ endpoint=project_scope,
+ credential=credential,
+ user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+ )

- # Get RAI service URL from discovery service and check service availability
- token = await fetch_or_reuse_token(credential)
- rai_svc_url = await get_rai_svc_url(project_scope, token)
- await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+ # Build the sync eval payload
+ sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id)
+ # Call sync_evals.create() with the JSON payload
+ eval_result = client.sync_evals.create(eval=sync_eval_payload)

- # Submit annotation request and fetch result
- operation_id = await submit_request(query, response, metric_name, rai_svc_url, token)
- annotation_response = await fetch_result(operation_id, rai_svc_url, credential, token)
- result = parse_response(annotation_response, metric_name)
+ # Return the raw EvalRunOutputItem for downstream processing
+ return eval_result

- return result
+
+ async def evaluate_with_rai_service_multimodal(
+ messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential
+ ):
+ """ "Evaluate the content safety of the response using Responsible AI service
+ :param messages: The normalized list of messages.
+ :type messages: str
+ :param metric_name: The evaluation metric to use.
+ :type metric_name: str
+ :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+ or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+ :type project_scope: Union[str, AzureAIProject]
+ :param credential: The Azure authentication credential.
+ :type credential: ~azure.core.credentials.TokenCredential
+ :return: The parsed annotation result.
+ :rtype: List[List[Dict]]
+ """
+
+ if is_onedp_project(project_scope):
+ client = AIProjectClient(
+ endpoint=project_scope,
+ credential=credential,
+ user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+ )
+ token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
+ await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
+ operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
+ annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
+ result = parse_response(annotation_response, metric_name)
+ return result
+ else:
+ token = await fetch_or_reuse_token(credential)
+ rai_svc_url = await get_rai_svc_url(project_scope, token)
+ await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+ # Submit annotation request and fetch result
+ operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+ annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+ result = parse_response(annotation_response, metric_name)
+ return result
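A hypothetical call to the multimodal entry point against a OneDP project; the endpoint, message content, metric name, and module path are placeholders or assumptions rather than values from this diff:

    import asyncio
    from azure.identity import DefaultAzureCredential
    # Module path assumed for this private helper.
    from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_multimodal

    messages = [
        {"role": "user", "content": [
            {"type": "text", "text": "Describe this picture."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ]},
        {"role": "assistant", "content": "A cat sitting on a windowsill."},
    ]

    result = asyncio.run(
        evaluate_with_rai_service_multimodal(
            messages=messages,
            metric_name="violence",                      # illustrative metric name
            project_scope="https://<project-endpoint>",  # OneDP projects are identified by an endpoint string
            credential=DefaultAzureCredential(),
        )
    )
    print(result)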