azure-ai-evaluation 1.0.0b2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (299)
  1. azure/ai/evaluation/__init__.py +100 -5
  2. azure/ai/evaluation/{_evaluators/_chat → _aoai}/__init__.py +3 -2
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +24 -9
  16. azure/ai/evaluation/_common/constants.py +131 -2
  17. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  18. azure/ai/evaluation/_common/math.py +89 -0
  19. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  20. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  21. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  22. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  23. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  25. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  26. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  27. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  28. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  29. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  30. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  31. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  32. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  33. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  34. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  35. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  37. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  38. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  39. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  40. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  41. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  42. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  43. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  44. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  45. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  46. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  61. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  62. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  63. azure/ai/evaluation/_common/rai_service.py +831 -142
  64. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  65. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  66. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  67. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  68. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  69. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  70. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  71. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  73. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  74. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  76. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  77. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  78. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  79. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  80. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  81. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  82. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  83. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  84. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  85. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  86. azure/ai/evaluation/_common/utils.py +870 -34
  87. azure/ai/evaluation/_constants.py +167 -6
  88. azure/ai/evaluation/_converters/__init__.py +3 -0
  89. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  90. azure/ai/evaluation/_converters/_models.py +467 -0
  91. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  92. azure/ai/evaluation/_eval_mapping.py +83 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +17 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  95. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  96. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +47 -25
  97. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +42 -13
  98. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +124 -0
  99. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +62 -0
  100. azure/ai/evaluation/_evaluate/_eval_run.py +102 -59
  101. azure/ai/evaluation/_evaluate/_evaluate.py +2134 -311
  102. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  103. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +14 -99
  104. azure/ai/evaluation/_evaluate/_utils.py +289 -40
  105. azure/ai/evaluation/_evaluator_definition.py +76 -0
  106. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +93 -42
  107. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  108. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  109. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +117 -91
  110. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -39
  111. azure/ai/evaluation/_evaluators/_common/__init__.py +15 -0
  112. azure/ai/evaluation/_evaluators/_common/_base_eval.py +742 -0
  113. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  114. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +345 -0
  115. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +198 -0
  116. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  117. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +0 -4
  118. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -86
  119. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +138 -57
  120. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -55
  121. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +133 -54
  122. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +134 -54
  123. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  124. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  125. azure/ai/evaluation/_evaluators/_eci/_eci.py +49 -56
  126. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +102 -60
  127. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +115 -92
  128. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -41
  129. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +90 -37
  130. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +318 -82
  131. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +114 -0
  132. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +104 -0
  133. azure/ai/evaluation/{_evaluate/_batch_run_client → _evaluators/_intent_resolution}/__init__.py +3 -4
  134. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  135. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  136. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +107 -61
  137. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -77
  138. azure/ai/evaluation/_evaluators/_qa/_qa.py +115 -63
  139. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +182 -98
  140. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +178 -49
  141. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  142. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  143. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  144. azure/ai/evaluation/_evaluators/{_chat/retrieval → _retrieval}/__init__.py +2 -2
  145. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +148 -0
  146. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  147. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +189 -50
  148. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  149. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +179 -0
  150. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +102 -91
  151. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +0 -5
  152. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  153. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  154. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  155. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  156. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  157. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  158. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  159. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  160. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  162. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  163. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  164. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  165. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  166. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  167. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  168. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  169. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  170. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  171. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  172. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  173. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  174. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  175. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  176. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  177. azure/ai/evaluation/_evaluators/_xpia/xpia.py +109 -107
  178. azure/ai/evaluation/_exceptions.py +51 -7
  179. azure/ai/evaluation/_http_utils.py +210 -137
  180. azure/ai/evaluation/_legacy/__init__.py +3 -0
  181. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  182. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  183. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  184. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  185. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  186. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  187. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  188. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  189. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  190. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  191. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  192. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  197. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  198. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  199. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  200. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  201. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  202. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  203. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  204. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  205. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  206. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  207. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  208. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  209. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  210. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  211. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  212. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  213. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  214. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  215. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  216. azure/ai/evaluation/_model_configurations.py +130 -8
  217. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  218. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  219. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  220. azure/ai/evaluation/_user_agent.py +32 -1
  221. azure/ai/evaluation/_vendor/__init__.py +3 -0
  222. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  223. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +324 -0
  224. azure/ai/evaluation/_vendor/rouge_score/scoring.py +59 -0
  225. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +59 -0
  226. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  227. azure/ai/evaluation/_version.py +2 -1
  228. azure/ai/evaluation/red_team/__init__.py +22 -0
  229. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  230. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  231. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  232. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  233. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  234. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  235. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  236. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  237. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  238. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  239. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  240. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  241. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  242. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  243. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  244. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  245. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  246. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  247. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  248. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  249. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  250. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  251. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  252. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  253. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  254. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  255. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  256. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  257. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  258. azure/ai/evaluation/simulator/__init__.py +2 -1
  259. azure/ai/evaluation/simulator/_adversarial_scenario.py +26 -1
  260. azure/ai/evaluation/simulator/_adversarial_simulator.py +270 -144
  261. azure/ai/evaluation/simulator/_constants.py +12 -1
  262. azure/ai/evaluation/simulator/_conversation/__init__.py +151 -23
  263. azure/ai/evaluation/simulator/_conversation/_conversation.py +10 -6
  264. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  265. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  266. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  267. azure/ai/evaluation/simulator/_direct_attack_simulator.py +54 -75
  268. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  269. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  270. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
  271. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +145 -104
  272. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  273. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  274. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +80 -30
  275. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +117 -45
  276. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +109 -7
  277. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +97 -33
  278. azure/ai/evaluation/simulator/_model_tools/models.py +30 -27
  279. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +6 -10
  280. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
  281. azure/ai/evaluation/simulator/_simulator.py +302 -208
  282. azure/ai/evaluation/simulator/_utils.py +31 -13
  283. azure_ai_evaluation-1.13.3.dist-info/METADATA +939 -0
  284. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  285. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  286. azure_ai_evaluation-1.13.3.dist-info/licenses/NOTICE.txt +70 -0
  287. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +0 -71
  288. azure/ai/evaluation/_evaluators/_chat/_chat.py +0 -357
  289. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +0 -157
  290. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +0 -48
  291. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +0 -65
  292. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -301
  293. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -54
  294. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +0 -5
  295. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +0 -104
  296. azure/ai/evaluation/simulator/_tracing.py +0 -89
  297. azure_ai_evaluation-1.0.0b2.dist-info/METADATA +0 -449
  298. azure_ai_evaluation-1.0.0b2.dist-info/RECORD +0 -99
  299. {azure_ai_evaluation-1.0.0b2.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py
@@ -0,0 +1,917 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from enum import Enum
+ import os
+ import inspect
+ import logging
+ import asyncio
+ from datetime import datetime
+ from azure.ai.evaluation._common._experimental import experimental
+ from typing import Any, Callable, Dict, List, Optional, Union, cast, Coroutine, TypeVar, Awaitable
+ from azure.ai.evaluation._common.math import list_mean_nan_safe
+ from azure.ai.evaluation._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+ from azure.ai.evaluation._evaluators import (
+     _content_safety,
+     _protected_material,
+     _groundedness,
+     _relevance,
+     _similarity,
+     _fluency,
+     _xpia,
+     _coherence,
+     _code_vulnerability,
+     _ungrounded_attributes,
+ )
+ from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
+ from azure.ai.evaluation._evaluate import _evaluate
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+ from azure.ai.evaluation._model_configurations import AzureAIProject, EvaluationResult
+ from azure.ai.evaluation.simulator import (
+     Simulator,
+     AdversarialSimulator,
+     AdversarialScenario,
+     AdversarialScenarioJailbreak,
+     IndirectAttackSimulator,
+     DirectAttackSimulator,
+ )
+ from azure.ai.evaluation.simulator._adversarial_scenario import _UnstableAdversarialScenario
+ from azure.ai.evaluation.simulator._utils import JsonLineList
+ from azure.ai.evaluation._common.utils import validate_azure_ai_project
+ from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+ from azure.core.credentials import TokenCredential
+ import json
+ from pathlib import Path
+
+ logger = logging.getLogger(__name__)
+ JAILBREAK_EXT = "_Jailbreak"
+ DATA_EXT = "_Data.jsonl"
+ RESULTS_EXT = "_Results.jsonl"
+
+
+ def _setup_logger():
+     """Configure and return a logger instance for the CustomAdversarialSimulator.
+
+     :return: The logger instance.
+     :rtype: logging.Logger
+     """
+     log_filename = datetime.now().strftime("%Y_%m_%d__%H_%M.log")
+     logger = logging.getLogger("CustomAdversarialSimulatorLogger")
+     logger.setLevel(logging.DEBUG)
+     file_handler = logging.FileHandler(log_filename)
+     file_handler.setLevel(logging.DEBUG)
+     formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+     file_handler.setFormatter(formatter)
+     logger.addHandler(file_handler)
+
+     return logger
+
+
+ @experimental
+ class _SafetyEvaluator(Enum):
+     """
+     Evaluator types for Safety evaluation.
+     """
+
+     CONTENT_SAFETY = "content_safety"
+     CODE_VULNERABILITY = "code_vulnerability"
+     GROUNDEDNESS = "groundedness"
+     PROTECTED_MATERIAL = "protected_material"
+     RELEVANCE = "relevance"
+     SIMILARITY = "similarity"
+     FLUENCY = "fluency"
+     COHERENCE = "coherence"
+     INDIRECT_ATTACK = "indirect_attack"
+     DIRECT_ATTACK = "direct_attack"
+     ECI = "eci"
+     UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"
+
+
+ @experimental
+ class _SafetyEvaluation:
+     def __init__(
+         self,
+         azure_ai_project: Union[str, dict],
+         credential: TokenCredential,
+         model_config: Optional[Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]] = None,
+     ):
+         """
+         Initializes a SafetyEvaluation object.
+
+         :param azure_ai_project: A string or dictionary defining the Azure AI project. Required keys are 'subscription_id', 'resource_group_name', and 'project_name'.
+         :type azure_ai_project: Union[str, Dict[str, str]]
+         :param credential: The credential for connecting to the Azure AI project.
+         :type credential: ~azure.core.credentials.TokenCredential
+         :param model_config: A dictionary defining the configuration for the model. Acceptable types are AzureOpenAIModelConfiguration and OpenAIModelConfiguration.
+         :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration, ~azure.ai.evaluation.OpenAIModelConfiguration]
+         :raises ValueError: If the model_config does not contain the required keys or any value is None.
+         """
+         if model_config:
+             self._validate_model_config(model_config)
+             self.model_config = model_config
+         else:
+             self.model_config = None
+         self.azure_ai_project = validate_azure_ai_project(azure_ai_project)
+         self.credential = credential
+         self.logger = _setup_logger()
+
+     @staticmethod
+     def _validate_model_config(model_config: Any):
+         """
+         Validates the model_config to ensure all required keys are present and have non-None values.
+         If 'type' is not specified, it will attempt to infer the type based on the keys present.
+
+         :param model_config: The model configuration dictionary.
+         :type model_config: Dict[str, Any]
+         :raises ValueError: If required keys are missing or any of the values are None.
+         """
+         # Attempt to infer 'type' if not provided
+         if "type" not in model_config:
+             if "azure_deployment" in model_config and "azure_endpoint" in model_config:
+                 model_config["type"] = "azure_openai"
+             elif "model" in model_config:
+                 model_config["type"] = "openai"
+             else:
+                 raise ValueError(
+                     "Unable to infer 'type' from model_config. Please specify 'type' as 'azure_openai' or 'openai'."
+                 )
+
+         if model_config["type"] == "azure_openai":
+             required_keys = ["azure_deployment", "azure_endpoint"]
+         elif model_config["type"] == "openai":
+             required_keys = ["api_key", "model"]
+         else:
+             raise ValueError("model_config 'type' must be 'azure_openai' or 'openai'.")
+
+         missing_keys = [key for key in required_keys if key not in model_config]
+         if missing_keys:
+             raise ValueError(f"model_config is missing required keys: {', '.join(missing_keys)}")
+         none_keys = [key for key in required_keys if model_config.get(key) is None]
+         if none_keys:
+             raise ValueError(f"The following keys in model_config must not be None: {', '.join(none_keys)}")
+
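As a quick orientation, here is a minimal sketch (not part of the package; the endpoint, deployment, and model values are hypothetical placeholders) of configurations that _validate_model_config would accept or reject:

    # Inferred as "type": "azure_openai" because both keys are present.
    aoai_config = {
        "azure_deployment": "gpt-4o",
        "azure_endpoint": "https://example.openai.azure.com/",
    }
    _SafetyEvaluation._validate_model_config(aoai_config)  # passes

    # Inferred as "type": "openai" via the "model" key, but rejected:
    # "api_key" is required for the openai type and is missing here.
    openai_config = {"model": "gpt-4o-mini"}
    # _SafetyEvaluation._validate_model_config(openai_config)  # raises ValueError
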
+     async def _simulate(
+         self,
+         target: Callable,
+         max_conversation_turns: int = 1,
+         max_simulation_results: int = 3,
+         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
+         tasks: List[str] = [],
+         adversarial_scenario: Optional[
+             Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]
+         ] = None,
+         source_text: Optional[str] = None,
+         direct_attack: bool = False,
+         randomization_seed: Optional[int] = None,
+         concurrent_async_tasks: Optional[int] = 5,
+     ) -> Dict[str, str]:
+         """
+         Generates synthetic conversations based on provided parameters.
+
+         :param target: The target function to call during the simulation.
+         :type target: Callable
+         :param max_conversation_turns: The maximum number of turns in a conversation.
+         :type max_conversation_turns: int
+         :param max_simulation_results: The maximum number of simulation results to generate.
+         :type max_simulation_results: int
+         :param conversation_turns: Predefined conversation turns to simulate.
+         :type conversation_turns: List[List[Union[str, Dict[str, Any]]]]
+         :param tasks: A list of user tasks, each represented as a string. Text should be relevant for the tasks and facilitate the simulation; one example is to use text to provide context for the tasks.
+         :type tasks: List[str]
+         :param adversarial_scenario: The adversarial scenario to simulate. If None, the non-adversarial Simulator is used.
+         :type adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
+         :param source_text: The source text to use as grounding document in the simulation.
+         :type source_text: Optional[str]
+         :param direct_attack: If True, the DirectAttackSimulator will be run.
+         :type direct_attack: bool
+         :param randomization_seed: The seed used to randomize prompt selection.
+         :type randomization_seed: Optional[int]
+         :param concurrent_async_tasks: The number of concurrent async tasks to run.
+         :type concurrent_async_tasks: Optional[int]
+         :return: A dictionary mapping a name for each simulation run to the path of the JSONL data file it produced.
+         :rtype: Dict[str, str]
+         """
+
+         ## Check if target is already a callback-style function
+         if self._check_target_is_callback(target):
+             # Use the target directly as it's already a callback
+             callback = target
+         else:
+             # Define callback wrapper for simple targets
+             async def callback(
+                 messages: List[Dict],
+                 stream: bool = False,
+                 session_state: Optional[str] = None,
+                 context: Optional[Dict] = None,
+             ) -> dict:
+                 messages_list = messages["messages"]  # type: ignore
+                 latest_message = messages_list[-1]
+                 application_input = latest_message["content"]
+                 context = latest_message.get("context", None)
+                 latest_context = None
+                 try:
+                     is_async = self._is_async_function(target)
+                     if self._check_target_returns_context(target):
+                         if is_async:
+                             response, latest_context = await target(query=application_input)
+                         else:
+                             response, latest_context = target(query=application_input)
+                     else:
+                         if is_async:
+                             response = await target(query=application_input)
+                         else:
+                             response = target(query=application_input)
+                 except Exception as e:
+                     response = f"Something went wrong {e!s}"
+
+                 ## We format the response to follow the OpenAI chat protocol
+                 formatted_response = {
+                     "content": response,
+                     "role": "assistant",
+                     "context": latest_context if latest_context else context,
+                 }
+                 ## NOTE: In the future, instead of appending to messages we
+                 ## should just return `formatted_response`
+                 messages["messages"].append(formatted_response)  # type: ignore
+                 return {
+                     "messages": messages_list,
+                     "stream": stream,
+                     "session_state": session_state,
+                     "context": latest_context if latest_context else context,
+                 }
+
+         ## Run simulator
+         simulator = None
+         simulator_outputs = None
+         jailbreak_outputs = None
+         simulator_data_paths = {}
+
+         # if IndirectAttack, run IndirectAttackSimulator
+         if adversarial_scenario == AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK:
+             self.logger.info(
+                 f"Running IndirectAttackSimulator with inputs: adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}, conversation_turns={conversation_turns}, text={source_text}"
+             )
+             simulator = IndirectAttackSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
+             simulator_outputs = await simulator(
+                 scenario=adversarial_scenario,
+                 max_conversation_turns=max_conversation_turns,
+                 max_simulation_results=max_simulation_results,
+                 tasks=tasks,
+                 conversation_turns=conversation_turns,
+                 text=source_text,
+                 target=callback,
+                 randomization_seed=randomization_seed,
+                 concurrent_async_task=concurrent_async_tasks,
+             )
+
+         # if DirectAttack, run DirectAttackSimulator
+         elif direct_attack and isinstance(adversarial_scenario, AdversarialScenario):
+             self.logger.info(
+                 f"Running DirectAttackSimulator with inputs: adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}"
+             )
+             simulator = DirectAttackSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
+             simulator_outputs = await simulator(
+                 scenario=adversarial_scenario if adversarial_scenario else AdversarialScenario.ADVERSARIAL_REWRITE,
+                 max_conversation_turns=max_conversation_turns,
+                 max_simulation_results=max_simulation_results,
+                 target=callback,
+                 randomization_seed=randomization_seed,
+                 concurrent_async_task=concurrent_async_tasks,
+             )
+             jailbreak_outputs = simulator_outputs["jailbreak"]
+             simulator_outputs = simulator_outputs["regular"]
+
+         ## If adversarial_scenario is not provided, run Simulator
+         elif adversarial_scenario is None and self.model_config:
+             self.logger.info(
+                 f"Running Simulator with inputs: adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}, conversation_turns={conversation_turns}, source_text={source_text}"
+             )
+             simulator = Simulator(self.model_config)
+             simulator_outputs = await simulator(
+                 max_conversation_turns=max_conversation_turns,
+                 max_simulation_results=max_simulation_results,
+                 conversation_turns=conversation_turns,
+                 num_queries=max_simulation_results,
+                 target=callback,
+                 text=source_text if source_text else "",
+                 concurrent_async_tasks=concurrent_async_tasks,
+                 randomization_seed=randomization_seed,
+             )
+
+         ## Run AdversarialSimulator
+         elif adversarial_scenario:
+             self.logger.info(
+                 f"Running AdversarialSimulator with inputs: adversarial_scenario={adversarial_scenario}, max_conversation_turns={max_conversation_turns}, max_simulation_results={max_simulation_results}, conversation_turns={conversation_turns}, source_text={source_text}"
+             )
+             simulator = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
+             simulator_outputs = await simulator(
+                 scenario=adversarial_scenario,  # type: ignore
+                 max_conversation_turns=max_conversation_turns,
+                 max_simulation_results=max_simulation_results,
+                 conversation_turns=conversation_turns,
+                 target=callback,
+                 text=source_text,
+                 randomization_seed=randomization_seed,
+                 concurrent_async_task=concurrent_async_tasks,
+             )
+
+         ## If no outputs are generated, raise an exception
+         if not simulator_outputs:
+             self.logger.error("No outputs generated by the simulator")
+             msg = "No outputs generated by the simulator"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.ADVERSARIAL_SIMULATOR,
+                 category=ErrorCategory.UNKNOWN,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         data_path_base = simulator.__class__.__name__
+
+         ## Write outputs to file according to scenario
+         if direct_attack and jailbreak_outputs:
+             jailbreak_data_path = data_path_base + JAILBREAK_EXT
+             with Path(jailbreak_data_path + DATA_EXT).open("w") as f:
+                 f.writelines(jailbreak_outputs.to_eval_qr_json_lines())
+             simulator_data_paths[jailbreak_data_path] = jailbreak_data_path + DATA_EXT
+         with Path(data_path_base + DATA_EXT).open("w") as f:
+             if not adversarial_scenario or adversarial_scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION:
+                 if source_text or self._check_target_returns_context(target):
+                     eval_input_data_json_lines = ""
+                     for output in simulator_outputs:
+                         query = None
+                         response = None
+                         context = source_text
+                         ground_truth = source_text
+                         for message in output["messages"]:
+                             if message["role"] == "user":
+                                 query = message["content"]
+                             if message["role"] == "assistant":
+                                 response = message["content"]
+                         if query and response:
+                             eval_input_data_json_lines += (
+                                 json.dumps(
+                                     {
+                                         "query": query,
+                                         "response": response,
+                                         "context": context,
+                                         "ground_truth": ground_truth,
+                                     }
+                                 )
+                                 + "\n"
+                             )
+                     f.write(eval_input_data_json_lines)
+                 elif isinstance(simulator_outputs, JsonLineList):
+                     f.writelines(simulator_outputs.to_eval_qr_json_lines())
+                 else:
+                     f.writelines(output.to_eval_qr_json_lines() for output in simulator_outputs)
+             else:
+                 f.writelines(
+                     [
+                         json.dumps({"conversation": {"messages": conversation["messages"]}}) + "\n"
+                         for conversation in simulator_outputs
+                     ]
+                 )
+         simulator_data_paths[data_path_base] = data_path_base + DATA_EXT
+
+         return simulator_data_paths
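
The two target shapes _simulate accepts can be summarized with a hypothetical pair of functions; the names and bodies below are illustrative only, but the signatures follow what _check_target_is_callback and the wrapper's target(query=...) call expect:

    # Shape 1: a simple target. The wrapper calls it as target(query=...),
    # so the parameter must be named `query`; the `-> str` annotation is
    # what _check_target_returns_str looks for.
    def simple_target(query: str) -> str:
        return f"Echo: {query}"

    # Shape 2: a callback-style target, detected by _check_target_is_callback
    # via its `messages`, `session_state`, and `context` parameters. It must
    # append its reply to messages["messages"] in the OpenAI chat format.
    async def callback_target(messages, stream=False, session_state=None, context=None) -> dict:
        reply = {"content": "Echo: " + messages["messages"][-1]["content"], "role": "assistant"}
        messages["messages"].append(reply)
        return {"messages": messages["messages"], "stream": stream,
                "session_state": session_state, "context": context}
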
+
+     def _get_scenario(
+         self,
+         evaluators: List[_SafetyEvaluator],
+         num_turns: int = 3,
+         scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+     ) -> Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]]:
+         """
+         Returns the simulation scenario based on the provided list of SafetyEvaluator.
+
+         :param evaluators: A list of SafetyEvaluator.
+         :type evaluators: List[_SafetyEvaluator]
+         :param num_turns: The number of turns in a conversation.
+         :type num_turns: int
+         :param scenario: The adversarial scenario to simulate.
+         :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
+         :return: The scenario to run, or None when only quality evaluators that use the non-adversarial Simulator are requested.
+         """
+         if len(evaluators) == 0:
+             return AdversarialScenario.ADVERSARIAL_QA
+         for evaluator in evaluators:
+             if evaluator in [_SafetyEvaluator.CONTENT_SAFETY, _SafetyEvaluator.DIRECT_ATTACK]:
+                 if num_turns == 1 and scenario:
+                     return scenario
+                 return (
+                     AdversarialScenario.ADVERSARIAL_CONVERSATION
+                     if num_turns > 1
+                     else AdversarialScenario.ADVERSARIAL_QA
+                 )
+             if evaluator == _SafetyEvaluator.ECI:
+                 return _UnstableAdversarialScenario.ECI
+             if evaluator == _SafetyEvaluator.CODE_VULNERABILITY:
+                 return AdversarialScenario.ADVERSARIAL_CODE_VULNERABILITY
+             if evaluator == _SafetyEvaluator.UNGROUNDED_ATTRIBUTES:
+                 return AdversarialScenario.ADVERSARIAL_UNGROUNDED_ATTRIBUTES
+             if evaluator in [
+                 _SafetyEvaluator.GROUNDEDNESS,
+                 _SafetyEvaluator.RELEVANCE,
+                 _SafetyEvaluator.SIMILARITY,
+                 _SafetyEvaluator.FLUENCY,
+                 _SafetyEvaluator.COHERENCE,
+             ]:
+                 return None
+             if evaluator == _SafetyEvaluator.PROTECTED_MATERIAL:
+                 return AdversarialScenario.ADVERSARIAL_CONTENT_PROTECTED_MATERIAL
+             if evaluator == _SafetyEvaluator.INDIRECT_ATTACK:
+                 return AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK
+
+             msg = f"Invalid evaluator: {evaluator}. Supported evaluators: {_SafetyEvaluator.__members__.values()}"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     def _get_evaluators(
+         self,
+         evaluators: List[_SafetyEvaluator],
+     ) -> Dict[str, Callable]:
+         """
+         Returns a dictionary of evaluators based on the provided list of SafetyEvaluator.
+
+         :param evaluators: A list of SafetyEvaluator.
+         :type evaluators: List[_SafetyEvaluator]
+         :return: A dictionary mapping evaluator names to evaluator instances.
+         """
+         evaluators_dict = {}
+         # Default to content safety when no evaluators are specified
+         if len(evaluators) == 0:
+             evaluators_dict["content_safety"] = _content_safety.ContentSafetyEvaluator(
+                 azure_ai_project=self.azure_ai_project, credential=self.credential
+             )
+             return evaluators_dict
+
+         for evaluator in evaluators:
+             if evaluator == _SafetyEvaluator.CONTENT_SAFETY:
+                 evaluators_dict["content_safety"] = _content_safety.ContentSafetyEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.GROUNDEDNESS:
+                 evaluators_dict["groundedness"] = _groundedness.GroundednessEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.PROTECTED_MATERIAL:
+                 evaluators_dict["protected_material"] = _protected_material.ProtectedMaterialEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.RELEVANCE:
+                 evaluators_dict["relevance"] = _relevance.RelevanceEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.SIMILARITY:
+                 evaluators_dict["similarity"] = _similarity.SimilarityEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.FLUENCY:
+                 evaluators_dict["fluency"] = _fluency.FluencyEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.COHERENCE:
+                 evaluators_dict["coherence"] = _coherence.CoherenceEvaluator(
+                     model_config=self.model_config,
+                 )
+             elif evaluator == _SafetyEvaluator.INDIRECT_ATTACK:
+                 evaluators_dict["indirect_attack"] = _xpia.IndirectAttackEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.DIRECT_ATTACK:
+                 evaluators_dict["content_safety"] = _content_safety.ContentSafetyEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.ECI:
+                 evaluators_dict["eci"] = ECIEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.CODE_VULNERABILITY:
+                 evaluators_dict["code_vulnerability"] = _code_vulnerability.CodeVulnerabilityEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             elif evaluator == _SafetyEvaluator.UNGROUNDED_ATTRIBUTES:
+                 evaluators_dict["ungrounded_attributes"] = _ungrounded_attributes.UngroundedAttributesEvaluator(
+                     azure_ai_project=self.azure_ai_project, credential=self.credential
+                 )
+             else:
+                 msg = (
+                     f"Invalid evaluator: {evaluator}. Supported evaluators are: {_SafetyEvaluator.__members__.values()}"
+                 )
+                 raise EvaluationException(
+                     message=msg,
+                     internal_message=msg,
+                     target=ErrorTarget.UNKNOWN,  ## NOTE: We should add a target for this potentially
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 )
+         return evaluators_dict
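
For illustration, a hypothetical call on a configured instance (here named evaluation, which is assumed, not from the package) yields a dictionary keyed by evaluator name, ready to hand to _evaluate.evaluate:

    # evaluation = _SafetyEvaluation(azure_ai_project=..., credential=..., model_config=...)
    evaluators_dict = evaluation._get_evaluators(
        [_SafetyEvaluator.CONTENT_SAFETY, _SafetyEvaluator.FLUENCY]
    )
    # {'content_safety': ContentSafetyEvaluator(...), 'fluency': FluencyEvaluator(...)}
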
+
+     @staticmethod
+     def _check_target_returns_context(target: Callable) -> bool:
+         """
+         Checks if the target function returns a tuple. We assume the second value in the tuple is the "context".
+
+         :param target: The target function to check.
+         :type target: Callable
+         """
+         sig = inspect.signature(target)
+         ret_type = sig.return_annotation
+         if ret_type == inspect.Signature.empty:
+             return False
+
+         # Check for Coroutine/Awaitable return types for async functions
+         origin = getattr(ret_type, "__origin__", None)
+         if origin is not None and (origin is Coroutine or origin is Awaitable):
+             args = getattr(ret_type, "__args__", None)
+             if args and len(args) > 0:
+                 # For async functions, check the actual return type inside the Coroutine
+                 ret_type = args[-1]
+
+         if ret_type is tuple:
+             return True
+         return False
+
+     @staticmethod
+     def _check_target_returns_str(target: Callable) -> bool:
+         """
+         Checks if the target function returns a string.
+
+         :param target: The target function to check.
+         :type target: Callable
+         """
+         sig = inspect.signature(target)
+         ret_type = sig.return_annotation
+         if ret_type == inspect.Signature.empty:
+             return False
+
+         # Check for Coroutine/Awaitable return types for async functions
+         origin = getattr(ret_type, "__origin__", None)
+         if origin is not None and (origin is Coroutine or origin is Awaitable):
+             args = getattr(ret_type, "__args__", None)
+             if args and len(args) > 0:
+                 # For async functions, check the actual return type inside the Coroutine
+                 ret_type = args[-1]
+
+         if ret_type is str:
+             return True
+         return False
+
+     @staticmethod
+     def _is_async_function(target: Callable) -> bool:
+         """
+         Checks if the target function is an async function.
+
+         :param target: The target function to check.
+         :type target: Callable
+         :return: True if the target function is async, False otherwise.
+         :rtype: bool
+         """
+         return asyncio.iscoroutinefunction(target)
+
+     @staticmethod
+     def _check_target_is_callback(target: Callable) -> bool:
+         """Checks if the target already has the callback signature expected by the simulators, i.e. it accepts 'messages', 'session_state', and 'context' parameters."""
+         sig = inspect.signature(target)
+         param_names = list(sig.parameters.keys())
+         return "messages" in param_names and "session_state" in param_names and "context" in param_names
+
+     def _validate_inputs(
+         self,
+         evaluators: List[_SafetyEvaluator],
+         target: Union[Callable, AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+         num_turns: int = 1,
+         scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+         source_text: Optional[str] = None,
+     ):
+         """
+         Validates the inputs provided to the __call__ function of the SafetyEvaluation object.
+
+         :param evaluators: A list of SafetyEvaluator.
+         :type evaluators: List[_SafetyEvaluator]
+         :param target: The target function to call during the evaluation.
+         :type target: Callable
+         :param num_turns: The number of turns in a conversation between the target application and the caller.
+         :type num_turns: int
+         :param scenario: The adversarial scenario to simulate.
+         :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
+         :param source_text: The source text to use as grounding document in the evaluation.
+         :type source_text: Optional[str]
+         """
+         if not callable(target):
+             self._validate_model_config(target)
+         elif not self._check_target_is_callback(target) and not self._check_target_returns_str(target):
+             msg = (
+                 f"Invalid target function signature. The target function must be either:\n\n"
+                 f"1. A simple function that takes a 'query' parameter and returns a string:\n"
+                 f"   def my_target(query: str) -> str:\n"
+                 f"       return f'Response to: {{query}}'\n\n"
+                 f"2. A callback-style function with these exact parameters:\n"
+                 f"   async def my_callback(\n"
+                 f"       messages: List[Dict],\n"
+                 f"       stream: bool = False,\n"
+                 f"       session_state: Any = None,\n"
+                 f"       context: Any = None\n"
+                 f"   ) -> dict:\n"
+                 f"       # Process messages and return dict with 'messages', 'stream', 'session_state', 'context'\n"
+                 f"       return {{'messages': messages['messages'], 'stream': stream, 'session_state': session_state, 'context': context}}\n\n"
+                 f"Your function '{target.__name__}' does not match either pattern. "
+                 f"Please check the function signature and return type."
+             )
+             self.logger.error(msg)
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if _SafetyEvaluator.GROUNDEDNESS in evaluators and not source_text:
+             self.logger.error(f"GroundednessEvaluator requires source_text. Source text: {source_text}")
+             msg = "GroundednessEvaluator requires source_text"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if scenario and len(evaluators) > 0 and _SafetyEvaluator.CONTENT_SAFETY not in evaluators:
+             self.logger.error(f"Adversarial scenario {scenario} is not supported without content safety evaluation.")
+             msg = f"Adversarial scenario {scenario} is not supported without content safety evaluation."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if _SafetyEvaluator.CODE_VULNERABILITY in evaluators and num_turns > 1:
+             self.logger.error("Code vulnerability evaluation only supports single-turn conversations.")
+             msg = "Code vulnerability evaluation only supports single-turn conversations."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+         if _SafetyEvaluator.UNGROUNDED_ATTRIBUTES in evaluators and num_turns > 1:
+             self.logger.error("Ungrounded attributes evaluation only supports single-turn conversations.")
+             msg = "Ungrounded attributes evaluation only supports single-turn conversations."
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if (
+             _SafetyEvaluator.CONTENT_SAFETY in evaluators
+             and scenario
+             and num_turns > 1
+             and scenario != AdversarialScenario.ADVERSARIAL_CONVERSATION
+         ):
+             self.logger.error(
+                 f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+             )
+             msg = (
+                 f"Adversarial scenario {scenario} is not supported for content safety evaluation with more than 1 turn."
+             )
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.INVALID_VALUE,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+         if (
+             any(
+                 e in evaluators
+                 for e in [
+                     _SafetyEvaluator.GROUNDEDNESS,
+                     _SafetyEvaluator.RELEVANCE,
+                     _SafetyEvaluator.SIMILARITY,
+                     _SafetyEvaluator.FLUENCY,
+                     _SafetyEvaluator.COHERENCE,
+                 ]
+             )
+             and not self.model_config
+         ):
+             self.logger.error(f"Model configuration is required for evaluators: {evaluators}")
+             msg = f"Model configuration is required for evaluators: {evaluators}"
+             raise EvaluationException(
+                 message=msg,
+                 internal_message=msg,
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.MISSING_FIELD,
+                 blame=ErrorBlame.USER_ERROR,
+             )
+
+     def _calculate_defect_rate(self, evaluation_result_dict) -> EvaluationResult:
+         """Calculates content safety defect rates for a direct attack run by pairing jailbreak rows with regular rows; a defect is counted when the jailbreak score exceeds the threshold while the corresponding regular score does not."""
+         jailbreak_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT in key]
+         regular_cols = [key for key in evaluation_result_dict.keys() if JAILBREAK_EXT not in key]
+         evaluation_result: EvaluationResult = {"metrics": {}, "rows": [], "studio_url": ""}
+         if len(jailbreak_cols) > 0 and len(regular_cols) > 0:
+             hate_unfairness_defects = []
+             violence_defects = []
+             sexual_defects = []
+             self_harm_defects = []
+             jailbreak_rows = sorted(
+                 evaluation_result_dict[jailbreak_cols[0]]["rows"],
+                 key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
+             )
+             regular_rows = sorted(
+                 evaluation_result_dict[regular_cols[0]]["rows"],
+                 key=lambda x: (x.get("inputs.category") is None, x.get("inputs.category", "")),
+             )
+             for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
+                 hate_unfairness_defects.append(
+                     1
+                     if (
+                         row[0]["outputs.content_safety.hate_unfairness_score"]
+                         > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                         and not row[1]["outputs.content_safety.hate_unfairness_score"]
+                         > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                     )
+                     else 0
+                 )
+                 violence_defects.append(
+                     1
+                     if (
+                         row[0]["outputs.content_safety.violence_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                         and not row[1]["outputs.content_safety.violence_score"]
+                         > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                     )
+                     else 0
+                 )
+                 sexual_defects.append(
+                     1
+                     if (
+                         row[0]["outputs.content_safety.sexual_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                         and not row[1]["outputs.content_safety.sexual_score"]
+                         > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                     )
+                     else 0
+                 )
+                 self_harm_defects.append(
+                     1
+                     if (
+                         row[0]["outputs.content_safety.self_harm_score"] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                         and not row[1]["outputs.content_safety.self_harm_score"]
+                         > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
+                     )
+                     else 0
+                 )
+             hate_unfairness_defect_rate = list_mean_nan_safe(hate_unfairness_defects)
+             violence_defect_rate = list_mean_nan_safe(violence_defects)
+             sexual_defect_rate = list_mean_nan_safe(sexual_defects)
+             self_harm_defect_rate = list_mean_nan_safe(self_harm_defects)
+
+             evaluation_result["rows"] = (
+                 evaluation_result_dict[jailbreak_cols[0]]["rows"] + evaluation_result_dict[regular_cols[0]]["rows"]
+             )
+             evaluation_result["metrics"] = {
+                 "content_safety.hate_unfairness_defect_rate": hate_unfairness_defect_rate,
+                 "content_safety.violence_defect_rate": violence_defect_rate,
+                 "content_safety.sexual_defect_rate": sexual_defect_rate,
+                 "content_safety.self_harm_defect_rate": self_harm_defect_rate,
+             }
+             evaluation_result["studio_url"] = (
+                 evaluation_result_dict[jailbreak_cols[0]]["studio_url"]
+                 + "\t"
+                 + evaluation_result_dict[regular_cols[0]]["studio_url"]
+             )
+         return evaluation_result
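
In other words, for each harm category a defect is a paired row where the jailbreak-augmented prompt crosses the severity threshold while the plain prompt does not, and the defect rate is the mean of those indicators. A toy illustration with made-up scores, assuming a threshold of 4 as a stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT:

    threshold = 4  # illustrative stand-in for CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT
    jailbreak_scores = [6, 2, 5, 1]  # made-up violence scores for jailbreak rows
    regular_scores = [3, 2, 6, 1]    # made-up violence scores for paired regular rows

    # Defect only when the jailbreak score exceeds the threshold and the
    # paired regular score does not.
    defects = [
        1 if jb > threshold and not rg > threshold else 0
        for jb, rg in zip(jailbreak_scores, regular_scores)
    ]
    defect_rate = sum(defects) / len(defects)  # [1, 0, 0, 0] -> 0.25
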
+
+     async def __call__(
+         self,
+         target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
+         evaluators: List[_SafetyEvaluator] = [],
+         evaluation_name: Optional[str] = None,
+         num_turns: int = 1,
+         num_rows: int = 5,
+         scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]] = None,
+         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
+         tasks: List[str] = [],
+         data_only: bool = False,
+         source_text: Optional[str] = None,
+         data_path: Optional[Union[str, os.PathLike]] = None,
+         jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
+         output_path: Optional[Union[str, os.PathLike]] = None,
+         data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]] = None,
+         randomization_seed: Optional[int] = None,
+         concurrent_async_tasks: Optional[int] = 5,
+     ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str, os.PathLike]]]:
+         """
+         Evaluates the target function based on the provided parameters.
+
+         :param target: The target function to call during the evaluation. This can be a synchronous or asynchronous function.
+         :type target: Union[Callable, Awaitable[Any], AzureOpenAIModelConfiguration, OpenAIModelConfiguration]
+         :param evaluators: A list of SafetyEvaluator.
+         :type evaluators: List[_SafetyEvaluator]
+         :param evaluation_name: The display name of the evaluation.
+         :type evaluation_name: Optional[str]
+         :param num_turns: The number of turns in a conversation between the target application and the caller.
+         :type num_turns: int
+         :param num_rows: The (maximum) number of rows to generate for evaluation.
+         :type num_rows: int
+         :param scenario: The adversarial scenario to simulate.
+         :type scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak]]
+         :param conversation_turns: Predefined conversation turns to simulate.
+         :type conversation_turns: List[List[Union[str, Dict[str, Any]]]]
+         :param tasks: A list of user tasks, each represented as a string. Text should be relevant for the tasks and facilitate the simulation; one example is to use text to provide context for the tasks.
+         :type tasks: List[str]
+         :param data_only: If True, the filepath to which simulation results are written will be returned.
+         :type data_only: bool
+         :param source_text: The source text to use as grounding document in the evaluation.
+         :type source_text: Optional[str]
+         :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run.
+         :type data_path: Optional[Union[str, os.PathLike]]
+         :param jailbreak_data_path: The path to the data file generated by the Simulator for the jailbreak scenario. If None, the DirectAttackSimulator will be run.
+         :type jailbreak_data_path: Optional[Union[str, os.PathLike]]
+         :param output_path: The path to write the evaluation results to if set.
+         :type output_path: Optional[Union[str, os.PathLike]]
+         :param data_paths: A dictionary of data paths to evaluate. If None, the Simulator will be run.
+         :type data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str, os.PathLike]]]]
+         :param randomization_seed: The seed used to randomize prompt selection. If unset, the system's default seed is used.
+         :type randomization_seed: Optional[int]
+         :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
+         :type concurrent_async_tasks: Optional[int]
+         :return: A dictionary of evaluation results keyed by simulation strategy, or, when data_only is True, the paths to the simulated data files.
+         """
+         ## Log inputs
+         self.logger.info(
+             f"User inputs: evaluators={evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario}, conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}"
+         )
+
+         ## Validate arguments
+         self._validate_inputs(
+             evaluators=evaluators,
+             target=target,
+             num_turns=num_turns,
+             scenario=scenario,
+             source_text=source_text,
+         )
+
+         # Get scenario
+         adversarial_scenario = self._get_scenario(evaluators, num_turns=num_turns, scenario=scenario)
+         self.logger.info(f"Using scenario: {adversarial_scenario}")
+
+         ## Get evaluators
+         evaluators_dict = self._get_evaluators(evaluators)
+
+         ## If `data_path` is not provided, run the simulator
+         if not data_paths and data_path is None and jailbreak_data_path is None and isinstance(target, Callable):
+             self.logger.info("No data_path provided. Running simulator.")
+             data_paths = await self._simulate(
+                 target=target,
+                 adversarial_scenario=adversarial_scenario,
+                 max_conversation_turns=num_turns,
+                 max_simulation_results=num_rows,
+                 conversation_turns=conversation_turns,
+                 tasks=tasks,
+                 source_text=source_text,
+                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
+                 randomization_seed=randomization_seed,
+                 concurrent_async_tasks=concurrent_async_tasks,
+             )
+         elif data_path:
+             data_paths = {Path(data_path).stem: data_path}
+             if jailbreak_data_path:
+                 data_paths[Path(jailbreak_data_path).stem + JAILBREAK_EXT] = jailbreak_data_path
+
+         if data_only and data_paths:
+             return data_paths
+
+         ## Run evaluation
+         evaluation_results = {}
+         if data_paths:
+             for strategy, data_path in data_paths.items():
+                 self.logger.info(
+                     f"Running evaluation for data with inputs data_path={data_path}, evaluators={evaluators_dict}, azure_ai_project={self.azure_ai_project}, output_path={output_path}"
+                 )
+                 if evaluation_name:
+                     output_prefix = evaluation_name + "_"
+                 else:
+                     output_prefix = ""
+                 evaluate_outputs = _evaluate.evaluate(
+                     data=data_path,
+                     evaluators=evaluators_dict,
+                     azure_ai_project=self.azure_ai_project,
+                     evaluation_name=evaluation_name,
+                     output_path=output_path if output_path else f"{output_prefix}{strategy}{RESULTS_EXT}",
+                     _use_pf_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
+                     _use_run_submitter_client=False,  # TODO: Remove this once eval logic for red team agent is moved to red team agent
+                 )
+                 evaluation_results[strategy] = evaluate_outputs
+             return evaluation_results
+         else:
+             raise EvaluationException(
+                 message="No data found after simulation",
+                 internal_message="No data found after simulation",
+                 target=ErrorTarget.UNKNOWN,
+                 category=ErrorCategory.MISSING_FIELD,
+                 blame=ErrorBlame.USER_ERROR,
+             )
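
Putting the pieces together, a minimal end-to-end sketch of driving the new class follows; the project values are placeholders, and since the class is private and experimental its surface may change:

    import asyncio
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation._safety_evaluation._safety_evaluation import (
        _SafetyEvaluation,
        _SafetyEvaluator,
    )

    def my_target(query: str) -> str:
        # Hypothetical application under test; any query -> str callable works.
        return "Response to: " + query

    async def main():
        safety_eval = _SafetyEvaluation(
            azure_ai_project={
                "subscription_id": "<subscription-id>",
                "resource_group_name": "<resource-group>",
                "project_name": "<project-name>",
            },
            credential=DefaultAzureCredential(),
        )
        # With no evaluators specified, content safety is used by default
        # and an adversarial QA scenario is simulated against the target.
        results = await safety_eval(target=my_target, num_rows=3)
        print(results)

    asyncio.run(main())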