azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of azure-ai-evaluation might be problematic. Click here for more details.

Files changed (277) hide show
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,114 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+
5
+ import logging
6
+ from typing import List, Optional
7
+
8
+ from pyrit.models import Score, PromptRequestPiece, UnvalidatedScore
9
+ from pyrit.score.scorer import Scorer
10
+
11
+ from azure.ai.evaluation.simulator._model_tools._generated_rai_client import GeneratedRAIClient
12
+ from ._rai_service_eval_chat_target import RAIServiceEvalChatTarget
13
+ from .._attack_objective_generator import RiskCategory
14
+
15
+
16
class AzureRAIServiceTrueFalseScorer(Scorer):
    """True/false scorer backed by the Azure RAI service.

    This class extends the PyRIT ``Scorer`` class. Scoring is delegated to a
    ``RAIServiceEvalChatTarget`` created in the constructor; the value it
    produces is wrapped into a single PyRIT ``Score`` that states whether the
    given prompt response achieves the specified objective.
    """

    scorer_type = "true_false"

    def __init__(
        self,
        *,
        client: GeneratedRAIClient,
        risk_category: RiskCategory,
        credential,
        azure_ai_project,
        api_version: Optional[str] = None,
        model: Optional[str] = None,
        prompt_template_key: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
        context: Optional[str] = None,
    ) -> None:
        """Initialize the scorer.

        :param client: The RAI client to use for scoring
        :param risk_category: Risk category forwarded to the evaluation chat target
        :param credential: Credential forwarded to the evaluation chat target
        :param azure_ai_project: Azure AI project forwarded to the evaluation chat target
        :param api_version: Optional API version to use
        :param model: Optional model name to score with
        :param prompt_template_key: Optional prompt template key to use
        :param logger: Optional logger to use; a module-level logger is used when omitted
        :param context: Optional context forwarded to the evaluation chat target
        """
        super().__init__()
        self._client = client
        self._api_version = api_version
        self._model = model
        self._prompt_template_key = prompt_template_key
        # score_async logs unconditionally, so never leave the logger as None.
        self.logger = logger if logger is not None else logging.getLogger(__name__)
        self.context = context

        # Scoring is performed through the RAI service evaluation chat target.
        self._scoring_target_red_llm = RAIServiceEvalChatTarget(
            logger=self.logger,
            credential=credential,
            azure_ai_project=azure_ai_project,
            risk_category=risk_category,
            context=context,
        )

    async def score_async(
        self,
        request_response: PromptRequestPiece,
        *,
        task: Optional[str] = None,
    ) -> List[Score]:
        """Score a prompt response asynchronously.

        :param request_response: The prompt response to score (PromptRequestPiece)
        :param task: Optional task description (objective)
        :return: List containing the single resulting Score object
        """
        self.logger.info("Starting to score prompt response")

        # Validate request_response
        self.validate(request_response, task=task)

        # When no context was configured on the target, fall back to the one carried
        # in the piece's labels (tolerating a missing labels mapping).
        if not self._scoring_target_red_llm.context:
            labels = request_response.labels or {}
            self._scoring_target_red_llm.context = labels.get("context", "")

        unvalidated_score: UnvalidatedScore = await self._score_value_with_llm(
            prompt_target=self._scoring_target_red_llm,
            system_prompt="Please evaluate the following response.",  # TODO: What should this be?
            prompt_request_value=request_response.converted_value,
            prompt_request_data_type=request_response.converted_value_data_type,
            scored_prompt_id=request_response.id,
            task=task,
            orchestrator_identifier=request_response.orchestrator_identifier,
        )

        score = unvalidated_score.to_score(score_value=unvalidated_score.raw_score_value)

        # NOTE(review): persisting the score to PyRIT memory is disabled here
        # (`self._memory.add_scores_to_memory(scores=[score])`) - confirm this is intended.
        return [score]

    def validate(self, request_response, *, task: Optional[str] = None):
        """Validates the request_response piece to score.

        Currently a no-op: every request_response is accepted.

        :param request_response: The request response to be validated
        :param task: The task based on which the text should be scored (the original attacker model's objective)
        """
        # Additional validation can be added here as needed.
        pass
@@ -0,0 +1,72 @@
1
+ """
2
+ Constants used in Red Team Agent.
3
+ """
4
+
5
+ import os
6
+ from .._attack_strategy import AttackStrategy
7
+ from .._attack_objective_generator import RiskCategory
8
+
9
# Baseline identifier and file extensions
BASELINE_IDENTIFIER = "baseline"
DATA_EXT = ".jsonl"
RESULTS_EXT = ".json"

# Attack strategies paired with their complexity level, in the order they are
# inserted into ATTACK_STRATEGY_COMPLEXITY_MAP below.
_STRATEGY_COMPLEXITY_PAIRS = (
    (AttackStrategy.Baseline, "baseline"),
    (AttackStrategy.AnsiAttack, "easy"),
    (AttackStrategy.AsciiArt, "easy"),
    (AttackStrategy.AsciiSmuggler, "easy"),
    (AttackStrategy.Atbash, "easy"),
    (AttackStrategy.Base64, "easy"),
    (AttackStrategy.Binary, "easy"),
    (AttackStrategy.Caesar, "easy"),
    (AttackStrategy.CharacterSpace, "easy"),
    (AttackStrategy.CharSwap, "easy"),
    (AttackStrategy.Diacritic, "easy"),
    (AttackStrategy.Flip, "easy"),
    (AttackStrategy.Leetspeak, "easy"),
    (AttackStrategy.Morse, "easy"),
    (AttackStrategy.ROT13, "easy"),
    (AttackStrategy.SuffixAppend, "easy"),
    (AttackStrategy.StringJoin, "easy"),
    (AttackStrategy.UnicodeConfusable, "easy"),
    (AttackStrategy.UnicodeSubstitution, "easy"),
    (AttackStrategy.Url, "easy"),
    (AttackStrategy.EASY, "easy"),
    (AttackStrategy.Tense, "moderate"),
    (AttackStrategy.MODERATE, "moderate"),
    (AttackStrategy.DIFFICULT, "difficult"),
    (AttackStrategy.Jailbreak, "easy"),
    (AttackStrategy.IndirectJailbreak, "easy"),
    (AttackStrategy.MultiTurn, "difficult"),
    (AttackStrategy.Crescendo, "difficult"),
)

# Mapping of attack strategies (keyed by the string form of the enum value)
# to complexity levels
ATTACK_STRATEGY_COMPLEXITY_MAP = {
    str(strategy.value): complexity for strategy, complexity in _STRATEGY_COMPLEXITY_PAIRS
}

# Task timeout
INTERNAL_TASK_TIMEOUT = 120  # presumably seconds - TODO confirm against the caller

# Sampling constants
# Caps the number of sampling iterations when round-robin sampling from risk subtypes:
# with N subtypes, up to N * MAX_SAMPLING_ITERATIONS_MULTIPLIER total iterations are allowed.
# This prevents infinite loops while still allowing enough attempts to find unique objectives.
MAX_SAMPLING_ITERATIONS_MULTIPLIER = 100

# Maximum number of subtypes per risk category.
# Used to calculate num_objectives_with_subtypes for adequate subtype coverage.
RISK_TO_NUM_SUBTYPE_MAP = {
    RiskCategory.ProhibitedActions: 32,
    RiskCategory.TaskAdherence: 9,
    RiskCategory.SensitiveDataLeakage: 19,
}

# Task status definitions (symbolic name -> status string)
TASK_STATUS = {
    "PENDING": "pending",
    "RUNNING": "running",
    "COMPLETED": "completed",
    "FAILED": "failed",
    "TIMEOUT": "timeout",
    "INCOMPLETE": "incomplete",
}
@@ -0,0 +1,345 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+ """
5
+ Exception handling utilities for Red Team Agent.
6
+
7
+ This module provides centralized exception handling, error categorization,
8
+ and error reporting utilities for red team operations.
9
+ """
10
+
11
+ import logging
12
+ import traceback
13
+ import asyncio
14
+ from typing import Optional, Any, Dict, Union
15
+ from enum import Enum
16
+
17
+
18
class ErrorCategory(Enum):
    """Kinds of failure a red team operation can be classified as.

    Each member's value is the lowercase string form of its name.
    """

    # Transport-level categories
    NETWORK = "network"
    AUTHENTICATION = "authentication"

    # Setup and data handling categories
    CONFIGURATION = "configuration"
    DATA_PROCESSING = "data_processing"

    # Red team pipeline stage categories
    ORCHESTRATOR = "orchestrator"
    EVALUATION = "evaluation"

    # Local resource categories
    FILE_IO = "file_io"
    TIMEOUT = "timeout"

    # Fallback when nothing else matches
    UNKNOWN = "unknown"
30
+
31
+
32
class ErrorSeverity(Enum):
    """How serious an error is, from least to most severe."""

    # Warning level, operation can continue
    LOW = "low"
    # Error level, task failed but scan can continue
    MEDIUM = "medium"
    # Critical error, scan should be aborted
    HIGH = "high"
    # Unrecoverable error
    FATAL = "fatal"
39
+
40
+
41
class RedTeamError(Exception):
    """Base exception for Red Team operations.

    Carries the error category, severity, a context dictionary and the
    exception that triggered it, alongside the human-readable message.
    """

    def __init__(
        self,
        message: str,
        category: ErrorCategory = ErrorCategory.UNKNOWN,
        severity: ErrorSeverity = ErrorSeverity.MEDIUM,
        context: Optional[Dict[str, Any]] = None,
        original_exception: Optional[Exception] = None,
    ):
        """Create the error.

        :param message: Human-readable description; also passed to ``Exception``
        :param category: Error category (defaults to ``ErrorCategory.UNKNOWN``)
        :param severity: Error severity (defaults to ``ErrorSeverity.MEDIUM``)
        :param context: Optional extra details; a falsy value is stored as an empty dict
        :param original_exception: Optional underlying exception
        """
        super().__init__(message)
        self.message = message

        # Classification of the failure
        self.category = category
        self.severity = severity

        # Supporting details
        self.context = context if context else {}
        self.original_exception = original_exception
58
+
59
+
60
+ class ExceptionHandler:
61
+ """Centralized exception handling for Red Team operations."""
62
+
63
def __init__(self, logger: Optional[logging.Logger] = None):
    """Set up the handler's logger and per-category error counters.

    :param logger: Logger instance for error reporting; a module-level
        logger is used when none is supplied
    """
    self.logger = logger if logger else logging.getLogger(__name__)
    # One counter per ErrorCategory member, all starting at zero.
    self.error_counts: Dict[ErrorCategory, int] = dict.fromkeys(ErrorCategory, 0)
70
+
71
def categorize_exception(self, exception: Exception) -> ErrorCategory:
    """Categorize an exception based on its HTTP status code, type and message.

    Checks run in this order and the first match wins:

    1. HTTP status code found on ``exception.response``:
       5xx -> NETWORK, 401 -> AUTHENTICATION, 403 -> CONFIGURATION.
       Any other status code falls through to the remaining checks.
    2. Known network exception types -> NETWORK.
    3. ``TimeoutError`` / ``asyncio.TimeoutError`` -> TIMEOUT.
    4. ``OSError`` (and subclasses) -> FILE_IO.
    5. Keyword search in the lowercased exception message.
    6. Otherwise UNKNOWN.

    :param exception: The exception to categorize
    :return: The appropriate error category
    """
    import httpx
    import httpcore

    # HTTP status code specific errors.
    # This must run before the type checks: httpx.HTTPError is the base class of
    # httpx.HTTPStatusError, so checking types first would label every httpx
    # 401/403 response as NETWORK and never reach this branch.
    response = getattr(exception, "response", None)
    status_code = getattr(response, "status_code", None)
    if isinstance(status_code, int):
        if 500 <= status_code < 600:
            return ErrorCategory.NETWORK
        if status_code == 401:
            return ErrorCategory.AUTHENTICATION
        if status_code == 403:
            return ErrorCategory.CONFIGURATION

    # Network-related errors
    network_exceptions = (
        httpx.ConnectTimeout,
        httpx.ReadTimeout,
        httpx.ConnectError,
        httpx.HTTPError,
        httpx.TimeoutException,
        httpcore.ReadTimeout,
        ConnectionError,
        ConnectionRefusedError,
        ConnectionResetError,
    )

    if isinstance(exception, network_exceptions):
        return ErrorCategory.NETWORK

    # Timeout errors (separate from network to handle asyncio.TimeoutError)
    if isinstance(exception, (TimeoutError, asyncio.TimeoutError)):
        return ErrorCategory.TIMEOUT

    # File I/O errors. IOError is an alias of OSError, and FileNotFoundError /
    # PermissionError are subclasses of it, so OSError alone covers them all.
    if isinstance(exception, OSError):
        return ErrorCategory.FILE_IO

    # String-based categorization
    message = str(exception).lower()

    # Keyword lists per category; dict order decides which category wins
    # when a message contains keywords from more than one list.
    keyword_mappings = {
        ErrorCategory.AUTHENTICATION: ["authentication", "unauthorized"],
        ErrorCategory.CONFIGURATION: ["configuration", "config"],
        ErrorCategory.ORCHESTRATOR: ["orchestrator"],
        ErrorCategory.EVALUATION: ["evaluation", "evaluate", "model_error"],
        ErrorCategory.DATA_PROCESSING: ["data", "json"],
    }

    for category, keywords in keyword_mappings.items():
        if any(keyword in message for keyword in keywords):
            return category

    return ErrorCategory.UNKNOWN
131
+
132
def determine_severity(
    self, exception: Exception, category: ErrorCategory, context: Optional[Dict[str, Any]] = None
) -> ErrorSeverity:
    """Map an exception and its category to a severity level.

    :param exception: The exception to evaluate
    :param category: The error category
    :param context: Additional context; only the ``critical_file`` key is read here
    :return: The appropriate error severity
    """
    details = context or {}

    # Critical system errors are fatal regardless of category.
    if isinstance(exception, (MemoryError, SystemExit, KeyboardInterrupt)):
        return ErrorSeverity.FATAL

    # Authentication and configuration problems are treated as high severity.
    high_categories = (ErrorCategory.AUTHENTICATION, ErrorCategory.CONFIGURATION)
    if category in high_categories:
        return ErrorSeverity.HIGH

    # File I/O is escalated only when the context flags a critical file.
    if category == ErrorCategory.FILE_IO:
        return ErrorSeverity.HIGH if details.get("critical_file", False) else ErrorSeverity.MEDIUM

    # Network/timeout errors (usually retryable) and task-specific errors are medium.
    medium_categories = (
        ErrorCategory.NETWORK,
        ErrorCategory.TIMEOUT,
        ErrorCategory.ORCHESTRATOR,
        ErrorCategory.EVALUATION,
        ErrorCategory.DATA_PROCESSING,
    )
    if category in medium_categories:
        return ErrorSeverity.MEDIUM

    # Everything else (e.g. UNKNOWN) is low severity.
    return ErrorSeverity.LOW
167
+
168
def handle_exception(
    self,
    exception: Exception,
    context: Optional[Dict[str, Any]] = None,
    task_name: Optional[str] = None,
    reraise: bool = False,
) -> RedTeamError:
    """Handle an exception with proper categorization and logging.

    An exception that is already a RedTeamError is logged and then returned
    (or re-raised) unchanged; it is not re-categorized and does not touch
    ``self.error_counts``. Any other exception is categorized, assigned a
    severity, counted, wrapped in a new RedTeamError and logged.

    :param exception: The exception to handle
    :param context: Additional context information
    :param task_name: Name of the task where the exception occurred
    :param reraise: Whether to raise the resulting RedTeamError after handling
    :return: A RedTeamError with categorized information
    :raises RedTeamError: When ``reraise`` is True
    """
    context = context or {}

    # If it's already a RedTeamError, just log and return/reraise
    if isinstance(exception, RedTeamError):
        self._log_error(exception, task_name)
        if reraise:
            raise exception
        return exception

    # Categorize the exception
    category = self.categorize_exception(exception)
    severity = self.determine_severity(exception, category, context)

    # Update error counts
    self.error_counts[category] += 1

    # Build a message of the form "<Category> error[ in <task>]: <original text>"
    message = f"{category.value.title()} error"
    if task_name:
        message += f" in {task_name}"
    message += f": {str(exception)}"

    red_team_error = RedTeamError(
        message=message, category=category, severity=severity, context=context, original_exception=exception
    )

    # Log the error
    self._log_error(red_team_error, task_name)

    if reraise:
        # Chain explicitly so the wrapper's __cause__ always points at the
        # original exception, even when we are not inside an ``except`` block.
        raise red_team_error from exception

    return red_team_error
216
+
217
def _log_error(self, error: RedTeamError, task_name: Optional[str] = None) -> None:
    """Log an error with appropriate level based on severity.

    The main record is emitted at CRITICAL / ERROR / WARNING / INFO for
    FATAL / HIGH / MEDIUM / any other severity. The error context and the
    original exception's traceback are emitted as extra DEBUG records.

    :param error: The RedTeamError to log
    :param task_name: Optional task name for context
    """
    # Determine log level based on severity
    if error.severity == ErrorSeverity.FATAL:
        log_level = logging.CRITICAL
    elif error.severity == ErrorSeverity.HIGH:
        log_level = logging.ERROR
    elif error.severity == ErrorSeverity.MEDIUM:
        log_level = logging.WARNING
    else:
        log_level = logging.INFO

    # Create log message: "[task] [category] [severity] message"
    message_parts = []
    if task_name:
        message_parts.append(f"[{task_name}]")
    message_parts.append(f"[{error.category.value}]")
    message_parts.append(f"[{error.severity.value}]")
    message_parts.append(error.message)

    log_message = " ".join(message_parts)

    # Log with appropriate level
    self.logger.log(log_level, log_message)

    # Log additional context if available
    if error.context:
        self.logger.debug(f"Error context: {error.context}")

    # Log original exception traceback for debugging.
    # traceback.format_exc() would describe whatever exception is *currently*
    # being handled (or "NoneType: None" when there is none), so format the
    # stored original exception explicitly instead.
    original = error.original_exception
    if original is not None and self.logger.isEnabledFor(logging.DEBUG):
        formatted = "".join(traceback.format_exception(type(original), original, original.__traceback__))
        self.logger.debug(f"Original exception traceback:\n{formatted}")
253
+
254
def should_abort_scan(self) -> bool:
    """Report whether the accumulated error counts justify stopping the scan.

    :return: True when authentication plus configuration errors exceed 2,
        or when network errors exceed 10; False otherwise
    """
    counts = self.error_counts

    # Authentication and configuration failures are pooled into one tally.
    blocking_errors = counts[ErrorCategory.AUTHENTICATION] + counts[ErrorCategory.CONFIGURATION]

    # The network count is consulted only if the first threshold is not hit;
    # a large number of network errors is taken as a systemic problem.
    return blocking_errors > 2 or counts[ErrorCategory.NETWORK] > 10
271
+
272
def get_error_summary(self) -> Dict[str, Any]:
    """Build a statistics snapshot of the errors recorded so far.

    :return: Dictionary with the total error count, a copy of the
        per-category counts, the category with the highest count (None when
        nothing has been recorded) and the current abort recommendation
    """
    counts = self.error_counts
    total = sum(counts.values())

    # Only name a "most common" category once at least one error exists.
    if total > 0:
        top_category = max(counts, key=counts.get)
    else:
        top_category = None

    return {
        "total_errors": total,
        "error_counts_by_category": dict(counts),
        "most_common_category": top_category,
        "should_abort": self.should_abort_scan(),
    }
285
+
286
def log_error_summary(self) -> None:
    """Write the error summary to the logger at INFO level.

    Emits a single "no errors" line when nothing was recorded; otherwise
    emits the total, one line per category with a non-zero count, and the
    most common category when one is reported.
    """
    report = self.get_error_summary()
    total = report["total_errors"]

    if total == 0:
        self.logger.info("No errors encountered during operation")
        return

    self.logger.info(f"Error Summary: {total} total errors")

    # Categories that never occurred are left out of the breakdown.
    occurred = (
        (category, count) for category, count in report["error_counts_by_category"].items() if count > 0
    )
    for category, count in occurred:
        self.logger.info(f" {category}: {count}")

    top_category = report["most_common_category"]
    if top_category:
        self.logger.info(f"Most common error type: {top_category}")
302
+
303
+
304
def create_exception_handler(logger: Optional[logging.Logger] = None) -> ExceptionHandler:
    """Factory that builds a new ExceptionHandler.

    :param logger: Logger forwarded to the handler for error reporting; may be None
    :return: A freshly constructed ExceptionHandler
    """
    handler = ExceptionHandler(logger=logger)
    return handler
311
+
312
+
313
+ # Convenience context manager for handling exceptions
314
class exception_context:
    """Context manager for handling exceptions in Red Team operations.

    Any exception raised inside the ``with`` body is passed to
    ``handler.handle_exception`` and the resulting RedTeamError is stored on
    ``self.error``. Errors of FATAL severity propagate (unless
    ``reraise_fatal`` is False); every other exception is suppressed.
    """

    def __init__(
        self,
        handler: ExceptionHandler,
        task_name: str,
        context: Optional[Dict[str, Any]] = None,
        reraise_fatal: bool = True,
    ):
        """Store the handler and the metadata attached to any handled error.

        :param handler: Handler used to categorize and log exceptions
        :param task_name: Task name passed to the handler for messages and logs
        :param context: Additional context passed to the handler
        :param reraise_fatal: Whether FATAL-severity errors propagate out of the block
        """
        self.handler = handler
        self.task_name = task_name
        self.context = context or {}
        self.reraise_fatal = reraise_fatal
        # Populated by __exit__ when the body raised; None otherwise.
        self.error: Optional[RedTeamError] = None

    def __enter__(self):
        """Return the context manager itself so callers can inspect ``error`` afterwards."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Handle an exception raised in the body, if any.

        :return: False when there was no exception or when the original
            exception should propagate unchanged; True to suppress it
        :raises RedTeamError: When a non-RedTeamError exception was wrapped
            into a FATAL error and ``reraise_fatal`` is True
        """
        if exc_val is None:
            return False

        self.error = self.handler.handle_exception(
            exception=exc_val, context=self.context, task_name=self.task_name, reraise=False
        )

        # Reraise fatal errors unless specifically disabled
        if self.reraise_fatal and self.error.severity == ErrorSeverity.FATAL:
            if self.error is exc_val:
                # The handler returned the very exception that is in flight.
                # __exit__ should not re-raise the passed-in exception;
                # returning False lets the with-statement propagate it.
                return False
            # Chain explicitly so the wrapper's __cause__ is the original error.
            raise self.error from exc_val

        # Suppress the original exception (we've handled it)
        return True