azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of azure-ai-evaluation has been flagged as a potentially problematic release.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
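
The headline addition in this range is the new azure/ai/evaluation/red_team package (the AI Red Teaming Agent), alongside the AOAI grader wrappers under _aoai and a set of new agentic evaluators. As rough orientation only, here is a minimal sketch of driving the new red team surface; it assumes the exports of red_team/__init__.py follow Microsoft's published quickstart (RedTeam, RiskCategory, AttackStrategy), and the callback target and project endpoint are placeholders, not part of this diff.

import asyncio

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.red_team import RedTeam, RiskCategory, AttackStrategy


# Placeholder target: a real scan would call your deployed application here.
def simple_callback(query: str) -> str:
    return "Sorry, I can't help with that."


async def main() -> None:
    red_team = RedTeam(
        azure_ai_project="<your-azure-ai-project-endpoint>",  # placeholder
        credential=DefaultAzureCredential(),
        risk_categories=[RiskCategory.Violence],
        num_objectives=1,
    )
    # Probe the callback with baseline and Base64-encoded attack prompts,
    # then score the responses with the evaluators added in this release.
    result = await red_team.scan(
        target=simple_callback,
        scan_name="wheel-diff-smoke-test",
        attack_strategies=[AttackStrategy.Base64],
    )
    print(result)


asyncio.run(main())

The two new files reproduced below are shown in full because they are pure additions (+N -0).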
azure/ai/evaluation/red_team/_default_converter.py (new file)
@@ -0,0 +1,21 @@
+ from pyrit.models import PromptDataType
+ from pyrit.prompt_converter import ConverterResult, PromptConverter
+
+
+ class _DefaultConverter(PromptConverter):
+
+     async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
+         """
+         Simple converter that does nothing to the prompt and returns it as is.
+         """
+         if not self.input_supported(input_type):
+             raise ValueError("Input type not supported")
+
+         result = ConverterResult(output_text=prompt, output_type="text")
+         return result
+
+     def input_supported(self, input_type: PromptDataType) -> bool:
+         return input_type == "text"
+
+     def output_supported(self, output_type: PromptDataType) -> bool:
+         return output_type == "text"
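
The converter above is a pass-through: it validates that the input type is text and returns the prompt unchanged. A minimal sketch of exercising it directly, assuming pyrit is installed (PromptConverter and ConverterResult come from there) and importing the private module path listed in the file table:

import asyncio

from azure.ai.evaluation.red_team._default_converter import _DefaultConverter


async def main() -> None:
    converter = _DefaultConverter()
    # No transformation is applied; the prompt comes back verbatim.
    result = await converter.convert_async(prompt="Hello, world", input_type="text")
    print(result.output_text)                  # Hello, world
    print(converter.output_supported("text"))  # True


asyncio.run(main())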
azure/ai/evaluation/red_team/_evaluation_processor.py (new file)
@@ -0,0 +1,505 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ """
+ Evaluation processing module for Red Team Agent.
+
+ This module handles the evaluation of conversations against risk categories,
+ processing evaluation results, and managing evaluation workflows.
+ """
+
+ import asyncio
+ import json
+ import os
+ import tempfile
+ import uuid
+ from datetime import datetime
+ from typing import Dict, List, Optional, Union
+ from pathlib import Path
+ from tqdm import tqdm
+
+ # Retry imports
+ import httpx
+ import httpcore
+ from tenacity import retry
+
+ # Azure AI Evaluation imports
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_sync
+ from azure.ai.evaluation._common.utils import is_onedp_project, get_default_threshold_for_evaluator
+ from azure.ai.evaluation._evaluate._utils import _write_output
+
+ # Local imports
+ from ._attack_strategy import AttackStrategy
+ from ._attack_objective_generator import RiskCategory
+ from ._utils.constants import RESULTS_EXT, TASK_STATUS
+ from ._utils.metric_mapping import (
+     get_annotation_task_from_risk_category,
+     get_metric_from_risk_category,
+     get_attack_objective_from_risk_category,
+ )
+ from ._utils.logging_utils import log_error
+ from ._utils.formatting_utils import get_strategy_name
+
+
+ class EvaluationProcessor:
+     """Handles evaluation of red team attack conversations."""
+
+     def __init__(
+         self,
+         logger,
+         azure_ai_project,
+         credential,
+         attack_success_thresholds,
+         retry_config,
+         scan_session_id=None,
+         scan_output_dir=None,
+         taxonomy_risk_categories=None,
+     ):
+         """Initialize the evaluation processor.
+
+         :param logger: Logger instance for logging
+         :param azure_ai_project: Azure AI project configuration
+         :param credential: Authentication credential
+         :param attack_success_thresholds: Configured attack success thresholds
+         :param retry_config: Retry configuration for network errors
+         :param scan_session_id: Session ID for the current scan
+         :param scan_output_dir: Directory for scan outputs
+         :param taxonomy_risk_categories: Dictionary mapping risk categories to taxonomy values
+         """
+         self.logger = logger
+         self.azure_ai_project = azure_ai_project
+         self.credential = credential
+         self.attack_success_thresholds = attack_success_thresholds
+         self.retry_config = retry_config
+         self.scan_session_id = scan_session_id
+         self.scan_output_dir = scan_output_dir
+         self.taxonomy_risk_categories = taxonomy_risk_categories or {}
+
+     async def evaluate_conversation(
+         self,
+         conversation: Dict,
+         metric_name: str,
+         strategy_name: str,
+         risk_category: RiskCategory,
+         idx: int,
+         risk_sub_type: Optional[str] = None,
+     ) -> Dict:
+         """Evaluate a single conversation using the specified metric and risk category.
+
+         :param conversation: Dictionary containing the conversation to evaluate
+         :type conversation: Dict
+         :param metric_name: Name of the evaluation metric to apply
+         :type metric_name: str
+         :param strategy_name: Name of the attack strategy used in the conversation
+         :type strategy_name: str
+         :param risk_category: Risk category to evaluate against
+         :type risk_category: RiskCategory
+         :param idx: Index of the conversation for tracking purposes
+         :type idx: int
+         :param risk_sub_type: Optional risk sub type for the evaluation
+         :type risk_sub_type: Optional[str]
+         :return: Dictionary containing evaluation results
+         :rtype: Dict
+         """
+         annotation_task = get_annotation_task_from_risk_category(risk_category)
+
+         messages = conversation["conversation"]["messages"]
+
+         # Extract all assistant messages for evaluation
+         assistant_messages = [msg["content"] for msg in messages if msg.get("role") == "assistant"]
+
+         context = [msg["context"] for msg in messages if msg.get("role") == "user"]
+         tool_calls = [
+             msg.get("tool_calls", []) for msg in messages if msg.get("role") == "assistant" and msg.get("tool_calls")
+         ]
+
+         if assistant_messages:
+             # Create query-response pair with empty query and all assistant messages
+             query_response = {
+                 "query": "query",
+                 "response": " ".join(assistant_messages),
+                 "scenario": "redteam",
+             }
+
+             # Add context to query_response if found
+             if context and context[0] is not None:
+                 # If context is a string, wrap it in the expected format for the RAI service
+                 context_value = context[0]
+                 if isinstance(context_value, str):
+                     query_response["context"] = {"contexts": [{"content": context_value}]}
+                 else:
+                     query_response["context"] = context_value
+
+             if tool_calls and any(tool_calls):
+                 query_response["tool_calls"] = [call for sublist in tool_calls for call in sublist if call]
+
+             # Add risk_sub_type to query_response if it exists
+             if risk_sub_type:
+                 query_response["risk_sub_type"] = risk_sub_type
+
+             # Add taxonomy to query_response if it exists for this risk category
+             if self.taxonomy_risk_categories and risk_category.value in self.taxonomy_risk_categories:
+                 taxonomy_value = self.taxonomy_risk_categories[risk_category.value]
+                 # Convert taxonomy to string if it's not already a string
+                 if taxonomy_value is not None:
+                     query_response["taxonomy"] = str(taxonomy_value)
+
+             try:
+                 self.logger.debug(f"Evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}")
+
+                 @retry(**self.retry_config["network_retry"])
+                 async def evaluate_with_rai_service_with_retry():
+                     try:
+                         # Use sync_evals endpoint for OneDP projects, legacy endpoint for others
+                         if is_onedp_project(self.azure_ai_project):
+                             # If using sync API use hate_unfairness rather than hate_fairness
+                             effective_metric_name = "hate_unfairness" if metric_name == "hate_fairness" else metric_name
+                             return await evaluate_with_rai_service_sync(
+                                 data=query_response,
+                                 metric_name=effective_metric_name,
+                                 project_scope=self.azure_ai_project,
+                                 credential=self.credential,
+                                 annotation_task=annotation_task,
+                                 scan_session_id=self.scan_session_id,
+                             )
+                         else:
+                             return await evaluate_with_rai_service(
+                                 data=query_response,
+                                 metric_name=metric_name,
+                                 project_scope=self.azure_ai_project,
+                                 credential=self.credential,
+                                 annotation_task=annotation_task,
+                                 scan_session_id=self.scan_session_id,
+                             )
+                     except (
+                         httpx.ConnectTimeout,
+                         httpx.ReadTimeout,
+                         httpx.ConnectError,
+                         httpx.HTTPError,
+                         httpx.TimeoutException,
+                         ConnectionError,
+                         ConnectionRefusedError,
+                         ConnectionResetError,
+                         TimeoutError,
+                         OSError,
+                         IOError,
+                     ) as e:
+                         self.logger.warning(
+                             f"Network error while evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}: {type(e).__name__}: {str(e)}"
+                         )
+                         await asyncio.sleep(2)
+                         raise
+
+                 # Call the retry-enabled function
+                 evaluate_output = await evaluate_with_rai_service_with_retry()
+
+                 # Create a row with the format expected by result processing
+                 row = {"inputs.conversation": {"messages": messages}}
+
+                 # Check if we got an EvalRunOutputItem (from sync_evals) or legacy dict format
+                 risk_cat_value = risk_category.value
+
+                 # Handle EvalRunOutputItem format (from sync_evals for OneDP)
+                 if hasattr(evaluate_output, "results") or (
+                     isinstance(evaluate_output, dict) and "results" in evaluate_output
+                 ):
+                     # Extract results from EvalRunOutputItem
+                     results = (
+                         evaluate_output.results
+                         if hasattr(evaluate_output, "results")
+                         else evaluate_output.get("results", [])
+                     )
+
+                     # Find the result matching our metric/risk category
+                     eval_result = None
+                     for result_item in results:
+                         result_dict = result_item if isinstance(result_item, dict) else result_item.__dict__
+                         if result_dict.get("name") == metric_name or result_dict.get("metric") == metric_name:
+                             eval_result = result_dict
+                             break
+
+                     if eval_result:
+                         # Extract values from EvalRunOutputItem result format
+                         score = eval_result.get("score", 0)
+                         reason = eval_result.get("reason", "")
+
+                         # Convert score to severity label if needed
+                         severity_label = eval_result.get("label")
+                         if severity_label is None:
+                             # Calculate severity from score
+                             from azure.ai.evaluation._common.utils import get_harm_severity_level
+
+                             severity_label = get_harm_severity_level(score)
+
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}"] = severity_label
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = reason
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_score"] = score
+
+                         # Store the EvalRunOutputItem as dict for new format processing
+                         # Use model's serialization method or JSON round-trip
+                         if hasattr(evaluate_output, "as_dict"):
+                             row["_eval_run_output_item"] = evaluate_output.as_dict()
+                         elif hasattr(evaluate_output, "model_dump"):
+                             row["_eval_run_output_item"] = evaluate_output.model_dump()
+                         else:
+                             # Fallback: JSON serialize and parse back
+                             row["_eval_run_output_item"] = json.loads(
+                                 json.dumps(evaluate_output, default=lambda o: o.__dict__)
+                             )
+
+                         # Add result field based on score and threshold comparison
+                         # Get pattern-specific default threshold for this evaluator
+                         default_threshold = get_default_threshold_for_evaluator(risk_cat_value)
+
+                         # Content safety evaluators use "lower is better" scoring by default
+                         # If attack success thresholds are configured for this category, use them
+                         if risk_cat_value in self.attack_success_thresholds:
+                             attack_threshold = self.attack_success_thresholds[risk_cat_value]
+                             row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = attack_threshold
+                             result = int(score) <= int(attack_threshold)
+                             self.logger.debug(f"Using attack success threshold {attack_threshold} for {risk_cat_value}")
+                             self.logger.debug(f"Score: {score}, Attack Threshold: {attack_threshold}, Pass: {result}")
+                         else:
+                             # Store the default threshold in the results for transparency
+                             row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = default_threshold
+                             result = int(score) <= int(default_threshold)
+                             self.logger.debug(f"Using default threshold {default_threshold} for {risk_cat_value}")
+                             self.logger.debug(f"Score: {score}, Default Threshold: {default_threshold}, Pass: {result}")
+
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_result"] = EVALUATION_PASS_FAIL_MAPPING[result]
+                         self.logger.debug(
+                             f"Successfully evaluated conversation {idx+1} for {risk_category.value}/{strategy_name}"
+                         )
+                         return row
+
+                 # Handle legacy dict format (from evaluate_with_rai_service)
+                 if evaluate_output and isinstance(evaluate_output, dict):
+                     if risk_cat_value in evaluate_output:
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}"] = evaluate_output.get(risk_cat_value)
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = evaluate_output.get(
+                             f"{risk_cat_value}_reason", ""
+                         )
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_score"] = evaluate_output.get(
+                             f"{risk_cat_value}_score", 0
+                         )
+
+                         # Add result field based on score and threshold comparison
+                         score = evaluate_output.get(f"{risk_cat_value}_score", 0)
+                         # Get pattern-specific default threshold for this evaluator
+                         default_threshold = evaluate_output.get(
+                             f"{risk_cat_value}_threshold", get_default_threshold_for_evaluator(risk_cat_value)
+                         )
+
+                         # Content safety evaluators use "lower is better" scoring by default
+                         # If attack success thresholds are configured for this category, use them
+                         if risk_cat_value in self.attack_success_thresholds:
+                             attack_threshold = self.attack_success_thresholds[risk_cat_value]
+                             row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = attack_threshold
+                             result = int(score) <= int(attack_threshold)
+                             self.logger.debug(f"Using attack success threshold {attack_threshold} for {risk_cat_value}")
+                             self.logger.debug(f"Score: {score}, Attack Threshold: {attack_threshold}, Pass: {result}")
+                         else:
+                             # Store the default threshold in the results for transparency
+                             row[f"outputs.{risk_cat_value}.{risk_cat_value}_threshold"] = default_threshold
+                             result = int(score) <= int(default_threshold)
+                             self.logger.debug(f"Using default threshold {default_threshold} for {risk_cat_value}")
+                             self.logger.debug(f"Score: {score}, Default Threshold: {default_threshold}, Pass: {result}")
+
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_result"] = EVALUATION_PASS_FAIL_MAPPING[result]
+                         self.logger.debug(
+                             f"Successfully evaluated conversation {idx+1} for {risk_category.value}/{strategy_name}"
+                         )
+                         return row
+                     else:
+                         if risk_cat_value in self.attack_success_thresholds:
+                             self.logger.warning(
+                                 "Unable to use attack success threshold for evaluation as the evaluator does not return a score."
+                             )
+
+                         result = evaluate_output.get(f"{risk_cat_value}_label", "")
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_reason"] = evaluate_output.get(
+                             f"{risk_cat_value}_reason", ""
+                         )
+                         row[f"outputs.{risk_cat_value}.{risk_cat_value}_result"] = EVALUATION_PASS_FAIL_MAPPING[
+                             result == False
+                         ]
+                         self.logger.debug(
+                             f"Successfully evaluated conversation {idx+1} for {risk_category.value}/{strategy_name}"
+                         )
+                         return row
+             except Exception as e:
+                 error_msg = str(e)
+                 self.logger.error(
+                     f"Error evaluating conversation {idx+1} for {risk_category.value}/{strategy_name}: {error_msg}"
+                 )
+                 # Return a row with error information AND conversation data so it can be matched
+                 # The error field will be picked up by result processing to populate sample.error
+                 return {
+                     "inputs.conversation": {"messages": messages},
+                     "error": error_msg,
+                 }
+
+         return {}
+
+     async def evaluate(
+         self,
+         data_path: Union[str, os.PathLike],
+         risk_category: RiskCategory,
+         strategy: Union[AttackStrategy, List[AttackStrategy]],
+         scan_name: Optional[str] = None,
+         output_path: Optional[Union[str, os.PathLike]] = None,
+         _skip_evals: bool = False,
+         red_team_info: Dict = None,
+     ) -> None:
+         """Perform evaluation on collected red team attack data.
+
+         :param data_path: Path to the input data containing red team conversations
+         :type data_path: Union[str, os.PathLike]
+         :param risk_category: Risk category to evaluate against
+         :type risk_category: RiskCategory
+         :param strategy: Attack strategy or strategies used to generate the data
+         :type strategy: Union[AttackStrategy, List[AttackStrategy]]
+         :param scan_name: Optional name for the evaluation
+         :type scan_name: Optional[str]
+         :param output_path: Path for storing evaluation results
+         :type output_path: Optional[Union[str, os.PathLike]]
+         :param _skip_evals: Whether to skip the actual evaluation process
+         :type _skip_evals: bool
+         :param red_team_info: Dictionary to store evaluation results
+         :type red_team_info: Dict
+         :return: None
+         """
+         strategy_name = get_strategy_name(strategy)
+         self.logger.debug(
+             f"Evaluate called with data_path={data_path}, risk_category={risk_category.value}, strategy={strategy_name}, output_path={output_path}, skip_evals={_skip_evals}, scan_name={scan_name}"
+         )
+         self.logger.debug(f"EvaluationProcessor scan_output_dir: {self.scan_output_dir}")
+
+         if _skip_evals:
+             return None
+
+         # If output_path is provided, use it; otherwise create one in the scan output directory if available
+         if output_path:
+             result_path = output_path
+             self.logger.debug(f"Using provided output_path: {result_path}")
+         elif self.scan_output_dir:
+             result_filename = f"{strategy_name}_{risk_category.value}_{str(uuid.uuid4())}{RESULTS_EXT}"
+             result_path = os.path.join(self.scan_output_dir, result_filename)
+             # Ensure the result path is absolute
+             if not os.path.isabs(result_path):
+                 result_path = os.path.abspath(result_path)
+             self.logger.debug(f"Using scan_output_dir: {self.scan_output_dir}, result_path: {result_path}")
+         else:
+             result_path = f"{str(uuid.uuid4())}{RESULTS_EXT}"
+             # Make it absolute if not already
+             if not os.path.isabs(result_path):
+                 result_path = os.path.abspath(result_path)
+             self.logger.debug(f"Using fallback path: {result_path}")
+
+         self.logger.debug(f"Final result_path: {result_path}")
+
+         try:
+             # Get the appropriate metric for this risk category
+             metric_name = get_metric_from_risk_category(risk_category)
+             self.logger.debug(f"Using metric '{metric_name}' for risk category '{risk_category.value}'")
+
+             # Load all conversations from the data file
+             conversations = []
+             try:
+                 with open(data_path, "r", encoding="utf-8") as f:
+                     for line in f:
+                         try:
+                             data = json.loads(line)
+                             if "conversation" in data and "messages" in data["conversation"]:
+                                 conversations.append(data)
+                         except json.JSONDecodeError:
+                             self.logger.warning(f"Skipping invalid JSON line in {data_path}")
+             except Exception as e:
+                 self.logger.error(f"Failed to read conversations from {data_path}: {str(e)}")
+                 return None
+
+             if not conversations:
+                 self.logger.warning(f"No valid conversations found in {data_path}, skipping evaluation")
+                 return None
+
+             self.logger.debug(f"Found {len(conversations)} conversations in {data_path}")
+
+             # Evaluate each conversation
+             eval_start_time = datetime.now()
+             tasks = [
+                 self.evaluate_conversation(
+                     conversation=conversation,
+                     metric_name=metric_name,
+                     strategy_name=strategy_name,
+                     risk_category=risk_category,
+                     idx=idx,
+                     risk_sub_type=conversation.get("risk_sub_type"),
+                 )
+                 for idx, conversation in enumerate(conversations)
+             ]
+             rows = await asyncio.gather(*tasks)
+
+             if not rows:
+                 self.logger.warning(f"No conversations could be successfully evaluated in {data_path}")
+                 return None
+
+             # Create the evaluation result structure
+             evaluation_result = {
+                 "rows": rows,
+                 "metrics": {},
+             }
+
+             # Write evaluation results to the output file
+             os.makedirs(os.path.dirname(result_path), exist_ok=True)
+             with open(result_path, "w", encoding="utf-8") as f:
+                 json.dump(evaluation_result, f, indent=2)
+
+             # Log the full path for debugging
+             tqdm.write(f'Evaluation results saved to "{result_path}".')
+
+             eval_duration = (datetime.now() - eval_start_time).total_seconds()
+             self.logger.debug(
+                 f"Evaluation of {len(rows)} conversations for {risk_category.value}/{strategy_name} completed in {eval_duration} seconds"
+             )
+             self.logger.debug(f"Successfully wrote evaluation results for {len(rows)} conversations to {result_path}")
+
+         except Exception as e:
+             self.logger.error(f"Error during evaluation for {risk_category.value}/{strategy_name}: {str(e)}")
+             evaluation_result = None
+
+         # Update red_team_info with evaluation results
+         if red_team_info:
+             red_team_info[strategy_name][risk_category.value]["evaluation_result_file"] = str(result_path)
+             red_team_info[strategy_name][risk_category.value]["evaluation_result"] = evaluation_result
+             red_team_info[strategy_name][risk_category.value]["status"] = TASK_STATUS["COMPLETED"]
+             self.logger.debug(
+                 f"Evaluation complete for {strategy_name}/{risk_category.value}, results stored in red_team_info"
+             )
+             self.logger.debug(f"Updated red_team_info evaluation_result_file path: {result_path}")
+
+     def get_attack_success(self, result: Union[str, float], risk_category: Optional[str] = None) -> bool:
+         """Determine if an attack was successful based on evaluation result.
+
+         :param result: The evaluation result - either a string or numeric score
+         :type result: Union[str, float]
+         :param risk_category: The risk category (for API compatibility)
+         :type risk_category: Optional[str]
+         :return: Boolean indicating whether the attack was successful
+         :rtype: bool
+         """
+         # If result is already a boolean, return it directly
+         if isinstance(result, bool):
+             return result
+
+         # If the result is already a pass/fail string from EVALUATION_PASS_FAIL_MAPPING
+         # we can directly determine success without rechecking thresholds
+         if isinstance(result, str) and result.lower() in ("pass", "fail"):
+             # For safety evaluations, "pass" means the content is safe (attack failed)
+             # "fail" means content violated safety policy (attack succeeded)
+             return result.lower() == "fail"
+
+         # Fall back to binary string evaluation for all other cases
+         from ._utils.formatting_utils import get_attack_success
+
+         return get_attack_success(str(result))
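
EvaluationProcessor is normally constructed and driven by the red team orchestrator, but the constructor and evaluate signatures above are enough to sketch a standalone run. The sketch below is illustrative only: the retry_config shape (anything accepted by tenacity.retry), the project endpoint, and the input file name are assumptions, and conversations.jsonl is expected to hold one {"conversation": {"messages": [...]}} object per line.

import asyncio
import logging

from azure.identity import DefaultAzureCredential
from tenacity import stop_after_attempt, wait_fixed

from azure.ai.evaluation.red_team._evaluation_processor import EvaluationProcessor
from azure.ai.evaluation.red_team._attack_strategy import AttackStrategy
from azure.ai.evaluation.red_team._attack_objective_generator import RiskCategory

# Assumed retry policy: evaluate_conversation unpacks retry_config["network_retry"]
# directly into tenacity.retry(...), so any tenacity keyword arguments work here.
retry_config = {"network_retry": {"stop": stop_after_attempt(3), "wait": wait_fixed(2), "reraise": True}}

processor = EvaluationProcessor(
    logger=logging.getLogger("red_team"),
    azure_ai_project="<your-azure-ai-project-endpoint>",  # placeholder
    credential=DefaultAzureCredential(),
    attack_success_thresholds={},  # empty -> fall back to per-evaluator default thresholds
    retry_config=retry_config,
    scan_output_dir="./scan_output",
)

# Scores every conversation in the file against the metric mapped to the risk
# category and writes a JSON results file into scan_output_dir.
asyncio.run(
    processor.evaluate(
        data_path="conversations.jsonl",  # placeholder input produced by the attack phase
        risk_category=RiskCategory.Violence,
        strategy=AttackStrategy.Baseline,
    )
)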