azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_common/rai_service.py
+++ b/azure/ai/evaluation/_common/rai_service.py
@@ -12,16 +12,21 @@ from ast import literal_eval
 from typing import Dict, List, Optional, Union, cast
 from urllib.parse import urlparse
 from string import Template
+from azure.ai.evaluation._common.onedp._client import ProjectsClient as AIProjectClient
+from azure.ai.evaluation._common.onedp.models import QueryResponseInlineMessage
+from azure.core.exceptions import HttpResponseError
 
 import jwt
 
-from promptflow.core._errors import MissingRequiredPackage
+from azure.ai.evaluation._legacy._adapters._errors import MissingRequiredPackage
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client
 from azure.ai.evaluation._model_configurations import AzureAIProject
+from azure.ai.evaluation._user_agent import UserAgentSingleton
+from azure.ai.evaluation._common.utils import is_onedp_project
 from azure.core.credentials import TokenCredential
 from azure.core.exceptions import HttpResponseError
-from azure.core.pipeline.policies import AsyncRetryPolicy
+from azure.core.pipeline.policies import AsyncRetryPolicy, UserAgentPolicy
 
 from .constants import (
     CommonConstants,
@@ -32,15 +37,14 @@ from .constants import (
 )
 from .utils import get_harm_severity_level, retrieve_content_type
 
-try:
-    version = importlib.metadata.version("azure-ai-evaluation")
-except importlib.metadata.PackageNotFoundError:
-    version = "unknown"
-USER_AGENT = "{}/{}".format("azure-ai-evaluation", version)
 
 USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = {
     "DEFAULT": Template("<Human>{$query}</><System>{$response}</>"),
 }
+ML_WORKSPACE = "https://management.azure.com/.default"
+COG_SRV_WORKSPACE = "https://ai.azure.com/.default"
+
+INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"
 
 
 def get_formatted_template(data: dict, annotation_task: str) -> str:
@@ -64,6 +68,16 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
             "context": data.get("context", ""),
         }
         return json.dumps(as_dict)
+    if annotation_task == Tasks.CODE_VULNERABILITY:
+        as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
+        return json.dumps(as_dict)
+    if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
+        as_dict = {
+            "query": data.get("query", ""),
+            "response": data.get("response", ""),
+            "context": data.get("context", ""),
+        }
+        return json.dumps(as_dict)
     as_dict = {
         "query": html.escape(data.get("query", "")),
         "response": html.escape(data.get("response", "")),
@@ -72,21 +86,24 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
     return user_text.replace("'", '\\"')
 
 
-def get_common_headers(token: str) -> Dict:
+def get_common_headers(token: str, evaluator_name: Optional[str] = None) -> Dict:
     """Get common headers for the HTTP request
 
     :param token: The Azure authentication token.
     :type token: str
+    :param evaluator_name: The evaluator name. Default is None.
+    :type evaluator_name: str
     :return: The common headers.
     :rtype: Dict
     """
+    user_agent = (
+        f"{UserAgentSingleton().value} (type=evaluator; subtype={evaluator_name})"
+        if evaluator_name
+        else UserAgentSingleton().value
+    )
     return {
         "Authorization": f"Bearer {token}",
-        "Content-Type": "application/json",
-        "User-Agent": USER_AGENT,
-        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
-        # https://github.com/encode/httpx/discussions/2959
-        "Connection": "close",
+        "User-Agent": user_agent,
     }
 
 
@@ -96,6 +113,34 @@ def get_async_http_client_with_timeout() -> AsyncHttpPipeline:
     )
 
 
+async def ensure_service_availability_onedp(
+    client: AIProjectClient, token: str, capability: Optional[str] = None
+) -> None:
+    """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
+
+    :param client: The AI project client.
+    :type client: AIProjectClient
+    :param token: The Azure authentication token.
+    :type token: str
+    :param capability: The capability to check. Default is None.
+    :type capability: str
+    :raises Exception: If the service is not available in the region or the capability is not available.
+    """
+    headers = get_common_headers(token)
+    capabilities = client.evaluations.check_annotation(headers=headers)
+
+    if capability and capability not in capabilities:
+        msg = f"The needed capability '{capability}' is not supported by the RAI service in this region."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.SERVICE_UNAVAILABLE,
+            blame=ErrorBlame.USER_ERROR,
+            tsg_link="https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot",
+        )
+
+
 async def ensure_service_availability(rai_svc_url: str, token: str, capability: Optional[str] = None) -> None:
     """Check if the Responsible AI service is available in the region and has the required capability, if relevant.
 
@@ -157,6 +202,8 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
     task = annotation_task
     if metric == EvaluationMetrics.PROTECTED_MATERIAL:
         include_metric = False
+    elif metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
+        include_metric = False
     elif metric == _InternalEvaluationMetrics.ECI:
         include_metric = False
     elif metric == EvaluationMetrics.XPIA:
@@ -175,7 +222,9 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
     )
 
 
-async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str) -> str:
+async def submit_request(
+    data: dict, metric: str, rai_svc_url: str, token: str, annotation_task: str, evaluator_name: str
+) -> str:
     """Submit request to Responsible AI service for evaluation and return operation ID
 
     :param data: The data to evaluate.
@@ -188,6 +237,8 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     :type token: str
     :param annotation_task: The annotation task to use.
     :type annotation_task: str
+    :param evaluator_name: The evaluator name.
+    :type evaluator_name: str
     :return: The operation ID.
     :rtype: str
     """
@@ -195,7 +246,7 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
 
     url = rai_svc_url + "/submitannotation"
-    headers = get_common_headers(token)
+    headers = get_common_headers(token, evaluator_name)
 
     async with get_async_http_client_with_timeout() as client:
         http_response = await client.post(url, json=payload, headers=headers)
@@ -208,6 +259,45 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str,
     return operation_id
 
 
+async def submit_request_onedp(
+    client: AIProjectClient,
+    data: dict,
+    metric: str,
+    token: str,
+    annotation_task: str,
+    evaluator_name: str,
+    scan_session_id: Optional[str] = None,
+) -> str:
+    """Submit request to Responsible AI service for evaluation and return operation ID
+
+    :param client: The AI project client.
+    :type client: AIProjectClient
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric: The evaluation metric to use.
+    :type metric: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param evaluator_name: The evaluator name.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The operation ID.
+    :rtype: str
+    """
+    normalized_user_text = get_formatted_template(data, annotation_task)
+    payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task)
+    headers = get_common_headers(token, evaluator_name)
+    if scan_session_id:
+        headers["x-ms-client-request-id"] = scan_session_id
+    response = client.evaluations.submit_annotation(payload, headers=headers)
+    result = json.loads(response)
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
 async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict:
     """Fetch the annotation result from Responsible AI service
 
@@ -230,8 +320,8 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
         token = await fetch_or_reuse_token(credential, token)
         headers = get_common_headers(token)
 
-        async with get_async_http_client_with_timeout() as client:
-            response = await client.get(url, headers=headers)
+        async with get_async_http_client() as client:
+            response = await client.get(url, headers=headers, timeout=RAIService.TIMEOUT)
 
         if response.status_code == 200:
             return response.json()
@@ -245,6 +335,37 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
         await asyncio.sleep(sleep_time)
 
 
+async def fetch_result_onedp(client: AIProjectClient, operation_id: str, token: str) -> Dict:
+    """Fetch the annotation result from Responsible AI service
+
+    :param client: The AI project client.
+    :type client: AIProjectClient
+    :param operation_id: The operation ID.
+    :type operation_id: str
+    :param token: The Azure authentication token.
+    :type token: str
+    :return: The annotation result.
+    :rtype: Dict
+    """
+    start = time.time()
+    request_count = 0
+
+    while True:
+        headers = get_common_headers(token)
+        try:
+            return client.evaluations.operation_results(operation_id, headers=headers)
+        except HttpResponseError:
+            request_count += 1
+            time_elapsed = time.time() - start
+            if time_elapsed > RAIService.TIMEOUT:
+                raise TimeoutError(
+                    f"Fetching annotation result {request_count} times out after {time_elapsed:.2f} seconds"
+                )
+
+            sleep_time = RAIService.SLEEP_TIME**request_count
+            await asyncio.sleep(sleep_time)
+
+
 def parse_response(  # pylint: disable=too-many-branches,too-many-statements
     batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
 ) -> Dict[str, Union[str, float]]:
@@ -267,10 +388,19 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
         EvaluationMetrics.PROTECTED_MATERIAL,
         _InternalEvaluationMetrics.ECI,
         EvaluationMetrics.XPIA,
+        EvaluationMetrics.CODE_VULNERABILITY,
+        EvaluationMetrics.UNGROUNDED_ATTRIBUTES,
     }:
         result = {}
         if not batch_response or len(batch_response[0]) == 0:
             return {}
+        if (
+            metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+            and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]
+        ):
+            batch_response[0] = {
+                EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
+            }
         if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
             pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
            for pm_metric_name in pm_metric_names:
@@ -282,6 +412,25 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
                 result[pm_metric_name + "_reason"] = (
                     parsed_response["reasoning"] if "reasoning" in parsed_response else ""
                 )
+                result[pm_metric_name + "_total_tokens"] = (
+                    parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_prompt_tokens"] = (
+                    parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_completion_tokens"] = (
+                    parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+                )
+                result[pm_metric_name + "_finish_reason"] = (
+                    parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+                )
+                result[pm_metric_name + "_sample_input"] = (
+                    parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+                )
+                result[pm_metric_name + "_sample_output"] = (
+                    parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+                )
+                result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
             return result
         if metric_name not in batch_response[0]:
             return {}
@@ -306,6 +455,46 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
             result[metric_display_name + "_information_gathering"] = (
                 parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
             )
+        if (
+            metric_name == EvaluationMetrics.CODE_VULNERABILITY
+            or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
+        ):
+            # Add all attributes under the details.
+            details = {}
+            for key, value in parsed_response.items():
+                if key not in {
+                    "label",
+                    "reasoning",
+                    "version",
+                    "totalTokenCount",
+                    "inputTokenCount",
+                    "outputTokenCount",
+                    "finish_reason",
+                    "sample_input",
+                    "sample_output",
+                    "model",
+                }:
+                    details[key.replace("-", "_")] = value
+            result[metric_display_name + "_details"] = details
+        result[metric_display_name + "_total_tokens"] = (
+            parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_prompt_tokens"] = (
+            parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_completion_tokens"] = (
+            parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
+        )
+        result[metric_display_name + "_finish_reason"] = (
+            parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
+        )
+        result[metric_display_name + "_sample_input"] = (
+            parsed_response["sample_input"] if "sample_input" in parsed_response else ""
+        )
+        result[metric_display_name + "_sample_output"] = (
+            parsed_response["sample_output"] if "sample_output" in parsed_response else ""
+        )
+        result[metric_display_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
         return result
     return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
 
@@ -345,6 +534,13 @@ def _parse_content_harm_response(
     except Exception:  # pylint: disable=broad-exception-caught
         harm_response = response[metric_name]
 
+    total_tokens = 0
+    prompt_tokens = 0
+    completion_tokens = 0
+    finish_reason = ""
+    sample_input = ""
+    sample_output = ""
+    model = ""
     if harm_response != "" and isinstance(harm_response, dict):
         # check if "output" is one key in harm_response
         if "output" in harm_response:
@@ -352,7 +548,14 @@ def _parse_content_harm_response(
 
         # get content harm metric_value
         if "label" in harm_response:
-            metric_value = float(harm_response["label"])
+            try:
+                # Handle "n/a" or other non-numeric values
+                if isinstance(harm_response["label"], str) and harm_response["label"].strip().lower() == "n/a":
+                    metric_value = math.nan
+                else:
+                    metric_value = float(harm_response["label"])
+            except (ValueError, TypeError):
+                metric_value = math.nan
         elif "valid" in harm_response:
             metric_value = 0 if harm_response["valid"] else math.nan
         else:
@@ -365,6 +568,44 @@ def _parse_content_harm_response(
             reason = harm_response["reason"]
         else:
             reason = ""
+
+        # get token_usage
+        if "totalTokenCount" in harm_response:
+            total_tokens = harm_response["totalTokenCount"]
+        else:
+            total_tokens = 0
+        if "inputTokenCount" in harm_response:
+            prompt_tokens = harm_response["inputTokenCount"]
+        else:
+            prompt_tokens = 0
+        if "outputTokenCount" in harm_response:
+            completion_tokens = harm_response["outputTokenCount"]
+        else:
+            completion_tokens = 0
+
+        # get finish_reason
+        if "finish_reason" in harm_response:
+            finish_reason = harm_response["finish_reason"]
+        else:
+            finish_reason = ""
+
+        # get sample_input
+        if "sample_input" in harm_response:
+            sample_input = harm_response["sample_input"]
+        else:
+            sample_input = ""
+
+        # get sample_output
+        if "sample_output" in harm_response:
+            sample_output = harm_response["sample_output"]
+        else:
+            sample_output = ""
+
+        # get model
+        if "model" in harm_response:
+            model = harm_response["model"]
+        else:
+            model = ""
     elif harm_response != "" and isinstance(harm_response, str):
         metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
         if metric_value_match:
@@ -383,8 +624,7 @@ def _parse_content_harm_response(
             reason = ""
 
     harm_score = metric_value
-    if metric_value == "n/a":
-        return result
+    # We've already handled the "n/a" case by converting to math.nan
     if not math.isnan(metric_value):
         # int(math.nan) causes a value error, and math.nan is already handled
         # by get_harm_severity_level
@@ -392,6 +632,13 @@ def _parse_content_harm_response(
     result[key] = get_harm_severity_level(harm_score)
     result[key + "_score"] = harm_score
     result[key + "_reason"] = reason
+    result[key + "_total_tokens"] = total_tokens
+    result[key + "_prompt_tokens"] = prompt_tokens
+    result[key + "_completion_tokens"] = completion_tokens
+    result[key + "_finish_reason"] = finish_reason
+    result[key + "_sample_input"] = sample_input
+    result[key + "_sample_output"] = sample_output
+    result[key + "_model"] = model
 
     return result
 
@@ -459,7 +706,9 @@ async def get_rai_svc_url(project_scope: AzureAIProject, token: str) -> str:
     return rai_url
 
 
-async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str] = None) -> str:
+async def fetch_or_reuse_token(
+    credential: TokenCredential, token: Optional[str] = None, workspace: Optional[str] = ML_WORKSPACE
+) -> str:
     """Get token. Fetch a new token if the current token is near expiry
 
     :param credential: The Azure authentication credential.
@@ -483,47 +732,68 @@ async def fetch_or_reuse_token(credential: TokenCredential, token: Optional[str]
             if (exp_time - current_time) >= 300:
                 return token
 
-    return credential.get_token("https://management.azure.com/.default").token
+    return credential.get_token(workspace).token
 
 
 async def evaluate_with_rai_service(
     data: dict,
     metric_name: str,
-    project_scope: AzureAIProject,
+    project_scope: Union[str, AzureAIProject],
     credential: TokenCredential,
     annotation_task: str = Tasks.CONTENT_HARM,
     metric_display_name=None,
+    evaluator_name=None,
+    scan_session_id: Optional[str] = None,
 ) -> Dict[str, Union[str, float]]:
-    """ "Evaluate the content safety of the response using Responsible AI service
+    """Evaluate the content safety of the response using Responsible AI service
 
-    :param data: The data to evaluate.
-    :type data: dict
-    :param metric_name: The evaluation metric to use.
-    :type metric_name: str
-    :param project_scope: The Azure AI project scope details.
-    :type project_scope: Dict
-    :param credential: The Azure authentication credential.
-    :type credential:
-        ~azure.core.credentials.TokenCredential
-    :param annotation_task: The annotation task to use.
-    :type annotation_task: str
-    :param metric_display_name: The display name of metric to use.
-    :type metric_display_name: str
-    :return: The parsed annotation result.
-    :rtype: Dict[str, Union[str, float]]
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :param evaluator_name: The evaluator name to use.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The parsed annotation result.
+    :rtype: Dict[str, Union[str, float]]
     """
 
-    # Get RAI service URL from discovery service and check service availability
-    token = await fetch_or_reuse_token(credential)
-    rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, annotation_task)
+    if is_onedp_project(project_scope):
+        client = AIProjectClient(
+            endpoint=project_scope,
+            credential=credential,
+            user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+        )
+        token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
+        await ensure_service_availability_onedp(client, token, annotation_task)
+        operation_id = await submit_request_onedp(
+            client, data, metric_name, token, annotation_task, evaluator_name, scan_session_id
+        )
+        annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
+        result = parse_response(annotation_response, metric_name, metric_display_name)
+        return result
+    else:
+        # Get RAI service URL from discovery service and check service availability
+        token = await fetch_or_reuse_token(credential)
+        rai_svc_url = await get_rai_svc_url(project_scope, token)
+        await ensure_service_availability(rai_svc_url, token, annotation_task)
 
-    # Submit annotation request and fetch result
-    operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task)
-    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name, metric_display_name)
+        # Submit annotation request and fetch result
+        operation_id = await submit_request(data, metric_name, rai_svc_url, token, annotation_task, evaluator_name)
+        annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+        result = parse_response(annotation_response, metric_name, metric_display_name)
 
-    return result
+        return result
 
 
 def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict:
@@ -604,29 +874,268 @@ async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, tok
     return operation_id
 
 
+async def submit_multimodal_request_onedp(client: AIProjectClient, messages, metric: str, token: str) -> str:
+
+    # handle inference sdk strongly type messages
+    if len(messages) > 0 and not isinstance(messages[0], dict):
+        try:
+            from azure.ai.inference.models import ChatRequestMessage
+        except ImportError as ex:
+            error_message = (
+                "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage"
+            )
+            raise MissingRequiredPackage(message=error_message) from ex
+        if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage):
+            messages = [message.as_dict() for message in messages]
+
+    ## fetch system and assistant messages from the list of messages
+    filtered_messages = [message for message in messages if message["role"] != "system"]
+    assistant_messages = [message for message in messages if message["role"] == "assistant"]
+
+    ## prepare for request
+    content_type = retrieve_content_type(assistant_messages, metric)
+    payload = generate_payload_multimodal(content_type, filtered_messages, metric)
+    headers = get_common_headers(token)
+
+    response = client.evaluations.submit_annotation(payload, headers=headers)
+
+    result = json.loads(response)
+    operation_id = result["location"].split("/")[-1]
+    return operation_id
+
+
+def _build_sync_eval_payload(
+    data: dict, metric_name: str, annotation_task: str, scan_session_id: Optional[str] = None
+) -> Dict:
+    """Build the sync_evals payload for evaluation using QueryResponseInlineMessage format.
+
+    :param data: The data to evaluate, containing 'query', 'response', and optionally 'context' and 'tool_calls'.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The sync_eval payload ready to send to the API.
+    :rtype: Dict
+    """
+
+    # Build properties/metadata (scenario, category, taxonomy, etc.)
+    properties = {}
+    if data.get("scenario") is not None:
+        properties["scenario"] = data["scenario"]
+    if data.get("risk_sub_type") is not None:
+        properties["category"] = data["risk_sub_type"]
+    if data.get("taxonomy") is not None:
+        properties["taxonomy"] = str(data["taxonomy"])  # Ensure taxonomy is converted to string
+
+    # Prepare context if available
+    context = None
+    if data.get("context") is not None:
+        context = " ".join(c["content"] for c in data["context"]["contexts"])
+
+    # Build QueryResponseInlineMessage object
+    item_content = QueryResponseInlineMessage(
+        query=data.get("query", ""),
+        response=data.get("response", ""),
+        context=context,
+        tools=data.get("tool_calls"),
+        properties=properties if properties else None,
+    )
+
+    # Build the data mapping using mustache syntax {{item.field}}
+    data_mapping = {
+        "query": "{{item.query}}",
+        "response": "{{item.response}}",
+    }
+
+    # Create the sync eval input payload
+    # Structure: Uses QueryResponseInlineMessage format with azure_ai_evaluator type
+    sync_eval_payload = {
+        "name": f"Safety Eval - {metric_name}",
+        "data_source": {
+            "type": "jsonl",
+            "source": {"type": "file_content", "content": {"item": item_content}},
+        },
+        "testing_criteria": [
+            {
+                "type": "azure_ai_evaluator",
+                "name": metric_name,
+                "evaluator_name": metric_name,
+                "data_mapping": data_mapping,
+            }
+        ],
+    }
+
+    return sync_eval_payload
+
+
+def _parse_sync_eval_result(
+    eval_result, metric_name: str, metric_display_name: Optional[str] = None
+) -> Dict[str, Union[str, float]]:
+    """Parse the result from sync_evals response (EvalRunOutputItem) into the standard format.
+
+    :param eval_result: The result from sync_evals.create() call (EvalRunOutputItem).
+    :param metric_name: The evaluation metric name.
+    :type metric_name: str
+    :param metric_display_name: The display name for the metric.
+    :type metric_display_name: Optional[str]
+    :return: The parsed result in standard format compatible with parse_response.
+    :rtype: Dict[str, Union[str, float]]
+    """
+    # Handle EvalRunOutputItem structure
+    # Expected structure: {'results': [{'name': 'violence', 'score': 0.0, 'reason': '...', ...}]}
+
+    display_name = metric_display_name or metric_name
+
+    # Handle both dict and object formats
+    if hasattr(eval_result, "results"):
+        results = eval_result.results
+    elif isinstance(eval_result, dict) and "results" in eval_result:
+        results = eval_result["results"]
+    else:
+        return {}
+
+    if not results or len(results) == 0:
+        return {}
+
+    # Find the result for our specific metric
+    target_result = None
+    for result_item in results:
+        if isinstance(result_item, dict):
+            if result_item.get("name") == metric_name or result_item.get("metric") == metric_name:
+                target_result = result_item
+                break
+        elif hasattr(result_item, "name") and result_item.name == metric_name:
+            target_result = result_item
+            break
+
+    if not target_result:
+        return {}
+
+    # Extract values from the result item
+    if isinstance(target_result, dict):
+        score = target_result.get("score", math.nan)
+        reason = target_result.get("reason", "")
+        # Also check properties.reasoning for additional reason text
+        if not reason and "properties" in target_result:
+            props = target_result["properties"]
+            if isinstance(props, dict):
+                reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+    else:
+        score = getattr(target_result, "score", math.nan)
+        reason = getattr(target_result, "reason", "")
+        if not reason and hasattr(target_result, "properties"):
+            props = target_result.properties
+            if isinstance(props, dict):
+                reason = props.get("reasoning", props.get("scoreProperties", {}).get("reasoning", ""))
+
+    # Convert score to severity level using existing logic
+    harm_score = score if not math.isnan(score) else math.nan
+    severity_level = get_harm_severity_level(harm_score) if not math.isnan(harm_score) else math.nan
+
+    # Return in the standard format expected by the red team processor
+    return {display_name: severity_level, f"{display_name}_score": harm_score, f"{display_name}_reason": reason}
+
+
+async def evaluate_with_rai_service_sync(
+    data: dict,
+    metric_name: str,
+    project_scope: Union[str, AzureAIProject],
+    credential: TokenCredential,
+    annotation_task: str = Tasks.CONTENT_HARM,
+    metric_display_name=None,
+    evaluator_name=None,
+    scan_session_id: Optional[str] = None,
+):
+    """Evaluate the content safety of the response using sync_evals endpoint (OneDP only).
+
+    This function uses the sync_evals.create() API instead of the legacy evaluations.submit_annotation()
+    approach. It's specifically designed for OneDP projects and provides better integration with
+    the newer evaluation infrastructure. Returns the raw EvalRunOutputItem for direct use.
+
+    :param data: The data to evaluate.
+    :type data: dict
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param annotation_task: The annotation task to use.
+    :type annotation_task: str
+    :param metric_display_name: The display name of metric to use.
+    :type metric_display_name: str
+    :param evaluator_name: The evaluator name to use.
+    :type evaluator_name: str
+    :param scan_session_id: The scan session ID to use for the evaluation.
+    :type scan_session_id: Optional[str]
+    :return: The EvalRunOutputItem containing the evaluation results.
+    :rtype: EvalRunOutputItem
+    :raises: EvaluationException if project_scope is not a OneDP project
+    """
+    if not is_onedp_project(project_scope):
+        msg = "evaluate_with_rai_service_sync only supports OneDP projects. Use evaluate_with_rai_service for legacy projects."
+        raise EvaluationException(
+            message=msg,
+            internal_message=msg,
+            target=ErrorTarget.RAI_CLIENT,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+
+    client = AIProjectClient(
+        endpoint=project_scope,
+        credential=credential,
+        user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+    )
+
+    # Build the sync eval payload
+    sync_eval_payload = _build_sync_eval_payload(data, metric_name, annotation_task, scan_session_id)
+    # Call sync_evals.create() with the JSON payload
+    eval_result = client.sync_evals.create(eval=sync_eval_payload)
+
+    # Return the raw EvalRunOutputItem for downstream processing
+    return eval_result
+
+
 async def evaluate_with_rai_service_multimodal(
-    messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential
+    messages, metric_name: str, project_scope: Union[str, AzureAIProject], credential: TokenCredential
 ):
     """ "Evaluate the content safety of the response using Responsible AI service
-    :param messages: The normalized list of messages.
-    :type messages: str
-    :param metric_name: The evaluation metric to use.
-    :type metric_name: str
-    :param project_scope: The Azure AI project scope details.
-    :type project_scope: Dict
-    :param credential: The Azure authentication credential.
-    :type credential:
-        ~azure.core.credentials.TokenCredential
-    :return: The parsed annotation result.
-    :rtype: List[List[Dict]]
+    :param messages: The normalized list of messages.
+    :type messages: str
+    :param metric_name: The evaluation metric to use.
+    :type metric_name: str
+    :param project_scope: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :type project_scope: Union[str, AzureAIProject]
+    :param credential: The Azure authentication credential.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :return: The parsed annotation result.
+    :rtype: List[List[Dict]]
     """
 
-    # Get RAI service URL from discovery service and check service availability
-    token = await fetch_or_reuse_token(credential)
-    rai_svc_url = await get_rai_svc_url(project_scope, token)
-    await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
-    # Submit annotation request and fetch result
-    operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
-    annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
-    result = parse_response(annotation_response, metric_name)
-    return result
+    if is_onedp_project(project_scope):
+        client = AIProjectClient(
+            endpoint=project_scope,
+            credential=credential,
+            user_agent_policy=UserAgentPolicy(base_user_agent=UserAgentSingleton().value),
+        )
+        token = await fetch_or_reuse_token(credential=credential, workspace=COG_SRV_WORKSPACE)
+        await ensure_service_availability_onedp(client, token, Tasks.CONTENT_HARM)
+        operation_id = await submit_multimodal_request_onedp(client, messages, metric_name, token)
+        annotation_response = cast(List[Dict], await fetch_result_onedp(client, operation_id, token))
+        result = parse_response(annotation_response, metric_name)
+        return result
+    else:
+        token = await fetch_or_reuse_token(credential)
+        rai_svc_url = await get_rai_svc_url(project_scope, token)
+        await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM)
+        # Submit annotation request and fetch result
+        operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token)
+        annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token))
+        result = parse_response(annotation_response, metric_name)
+        return result
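
For orientation, a minimal usage sketch of the updated evaluate_with_rai_service signature follows. It is not part of the package diff: the project endpoint and the sample query/response values are hypothetical placeholders, and DefaultAzureCredential from azure-identity stands in for whatever credential the caller already has; only the call shape mirrors the signature shown in the diff above.

# Hypothetical usage sketch (not shipped with the package); endpoint and data are placeholders.
import asyncio

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation._common.constants import EvaluationMetrics, Tasks
from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service


async def main() -> None:
    credential = DefaultAzureCredential()

    # Passing a project endpoint string selects the new OneDP code path;
    # passing an AzureAIProject dict keeps the legacy discovery-based flow.
    result = await evaluate_with_rai_service(
        data={"query": "How do I bake bread?", "response": "Mix flour, water, salt, and yeast."},
        metric_name=EvaluationMetrics.VIOLENCE,
        project_scope="https://example.services.ai.azure.com/api/projects/my-project",  # placeholder endpoint
        credential=credential,
        annotation_task=Tasks.CONTENT_HARM,
        evaluator_name="violence",
    )

    # The parsed result now also carries token usage and model metadata alongside the severity fields,
    # e.g. "violence", "violence_score", "violence_reason", "violence_total_tokens", "violence_model".
    print(result)


if __name__ == "__main__":
    asyncio.run(main())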