azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
@@ -7,11 +7,12 @@ import asyncio
  import importlib.resources as pkg_resources
  import json
  import os
+ import random
  import re
  import warnings
  from typing import Any, Callable, Dict, List, Optional, Union, Tuple

- from promptflow.core import AsyncPrompty
+ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty
  from tqdm import tqdm

  from azure.ai.evaluation._common._experimental import experimental
@@ -19,7 +20,7 @@ from azure.ai.evaluation._common.utils import construct_prompty_model_config
  from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration

  from .._exceptions import ErrorBlame, ErrorCategory, EvaluationException
- from .._user_agent import USER_AGENT
+ from .._user_agent import UserAgentSingleton
  from ._conversation.constants import ConversationRole
  from ._helpers import ConversationHistory, Turn
  from ._utils import JsonLineChatProtocol
@@ -50,6 +51,10 @@ class Simulator:
  if "api_version" not in self.model_config:
  self.model_config["api_version"] = "2024-06-01" # type: ignore

+ @staticmethod
+ def __user_agent() -> str:
+ return f"{UserAgentSingleton().value} (type=simulator; subtype=Simulator)"
+
  @staticmethod
  def _validate_model_config(model_config: Any):
  """
@@ -100,6 +105,7 @@ class Simulator:
  user_simulator_prompty_options: Dict[str, Any] = {},
  conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
  concurrent_async_tasks: int = 5,
+ randomization_seed: Optional[int] = None,
  **kwargs,
  ) -> List[JsonLineChatProtocol]:
  """
@@ -130,6 +136,9 @@ class Simulator:
  :keyword concurrent_async_tasks: The number of asynchronous tasks to run concurrently during the simulation.
  Defaults to 5.
  :paramtype concurrent_async_tasks: int
+ :keyword randomization_seed: The seed used to randomize task/query order. If unset, the system's
+ default seed is used. Defaults to None.
+ :paramtype randomization_seed: Optional[int]
  :return: A list of simulated conversations represented as JsonLineChatProtocol objects.
  :rtype: List[JsonLineChatProtocol]

@@ -154,7 +163,14 @@
  f"You have specified 'num_queries' < len('tasks') ({num_queries} < {len(tasks)}). "
  f"Only the first {num_queries} lines of the specified tasks will be simulated."
  )
- num_queries = min(num_queries, len(tasks))
+
+ # Apply randomization to tasks if seed is provided
+ if randomization_seed is not None and tasks:
+ # Create a local random instance to avoid polluting global state
+ local_random = random.Random(randomization_seed)
+ tasks = tasks.copy() # Don't modify the original list
+ local_random.shuffle(tasks)
+
  max_conversation_turns *= 2 # account for both user and assistant turns

  prompty_model_config = self.model_config
@@ -375,7 +391,7 @@
  prompty_model_config = construct_prompty_model_config(
  model_config=prompty_model_config, # type: ignore
  default_api_version="2024-06-01",
- user_agent=USER_AGENT,
+ user_agent=self.__user_agent(),
  )
  return AsyncPrompty.load(source=prompty_path, model=prompty_model_config) # type: ignore
  except FileNotFoundError as e:
@@ -389,7 +405,7 @@
  prompty_model_config = construct_prompty_model_config(
  model_config=prompty_model_config, # type: ignore
  default_api_version="2024-06-01",
- user_agent=USER_AGENT,
+ user_agent=self.__user_agent(),
  )
  return AsyncPrompty.load(
  source=user_simulator_prompty,
@@ -514,7 +530,7 @@
  prompty_model_config = construct_prompty_model_config(
  model_config=prompty_model_config, # type: ignore
  default_api_version="2024-06-01",
- user_agent=USER_AGENT,
+ user_agent=self.__user_agent(),
  )
  return AsyncPrompty.load(source=prompty_path, model=prompty_model_config) # type: ignore
  except FileNotFoundError as e:
@@ -528,7 +544,7 @@
  prompty_model_config = construct_prompty_model_config(
  model_config=prompty_model_config, # type: ignore
  default_api_version="2024-06-01",
- user_agent=USER_AGENT,
+ user_agent=self.__user_agent(),
  )
  return AsyncPrompty.load(
  source=query_response_generating_prompty,
@@ -583,7 +599,10 @@
  for i, query_response_pair in enumerate(query_responses):
  query = query_response_pair["q"]
  response = query_response_pair["r"]
- task = tasks[i]
+ try:
+ task = tasks[i]
+ except IndexError:
+ task = None

  conversation = await self._complete_conversation(
  conversation_starter=query,
@@ -618,7 +637,7 @@
  *,
  conversation_starter: str,
  max_conversation_turns: int,
- task: str,
+ task: Optional[str],
  user_simulator_prompty: Optional[str],
  user_simulator_prompty_options: Dict[str, Any],
  target: Callable,
@@ -656,16 +675,21 @@
  user_simulator_prompty_options=user_simulator_prompty_options,
  )
  if len(conversation_history) == 0:
- conversation_starter_from_simulated_user = await user_flow(
- task=task,
- conversation_history=[
- {
- "role": "assistant",
- "content": conversation_starter,
- }
- ],
- action="rewrite the assistant's message as you have to accomplish the task by asking the right questions. Make sure the original question is not lost in your rewrite.",
- )
+ if task:
+ conversation_starter_from_simulated_user = await user_flow(
+ task=task,
+ conversation_history=[
+ {
+ "role": "assistant",
+ "content": conversation_starter,
+ }
+ ],
+ action="rewrite the assistant's message as you have to accomplish the task by asking the right questions. Make sure the original question is not lost in your rewrite.",
+ )
+ else:
+ conversation_starter_from_simulated_user = {
+ "content": conversation_starter,
+ }
  else:
  conversation_starter_from_simulated_user = await user_flow(
  task=task,
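The hunks above add a `randomization_seed` keyword to `Simulator.__call__` and shuffle a copy of `tasks` with a locally seeded `random.Random`, so a given seed reproduces the same task/query order without touching global random state. A minimal usage sketch, assuming an Azure OpenAI model configuration and a placeholder `callback` target (endpoint, deployment, and task strings are illustrative):

```python
import asyncio
from azure.ai.evaluation.simulator import Simulator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
    # "api_version" defaults to "2024-06-01" when omitted, per the hunk above.
}

async def callback(messages, stream=False, session_state=None, context=None):
    # Placeholder target: echo the last user message back as the assistant reply.
    last_user = messages["messages"][-1]["content"]
    messages["messages"].append({"role": "assistant", "content": f"You said: {last_user}"})
    return {"messages": messages["messages"], "stream": stream,
            "session_state": session_state, "context": context}

async def main():
    simulator = Simulator(model_config=model_config)
    outputs = await simulator(
        target=callback,
        text="Contoso allows returns within 30 days of purchase.",
        num_queries=4,
        max_conversation_turns=2,
        tasks=["Ask about returns", "Ask about refunds", "Ask about exchanges", "Ask about shipping"],
        randomization_seed=42,  # same seed -> same shuffled task/query order on every run
    )
    print(f"Simulated {len(outputs)} conversations")

asyncio.run(main())
```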
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: azure-ai-evaluation
- Version: 1.0.1
+ Version: 1.13.3
  Summary: Microsoft Azure Evaluation Library for Python
  Home-page: https://github.com/Azure/azure-sdk-for-python
  Author: Microsoft Corporation
@@ -13,23 +13,48 @@ Classifier: Development Status :: 5 - Production/Stable
  Classifier: Programming Language :: Python
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: License :: OSI Approved :: MIT License
  Classifier: Operating System :: OS Independent
- Requires-Python: >=3.8
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: NOTICE.txt
- Requires-Dist: promptflow-devkit >=1.15.0
- Requires-Dist: promptflow-core >=1.15.0
- Requires-Dist: pyjwt >=2.8.0
- Requires-Dist: azure-identity >=1.16.0
- Requires-Dist: azure-core >=1.30.2
- Requires-Dist: nltk >=3.9.1
- Provides-Extra: remote
- Requires-Dist: promptflow-azure <2.0.0,>=1.15.0 ; extra == 'remote'
+ Requires-Dist: pyjwt>=2.8.0
+ Requires-Dist: azure-identity>=1.19.0
+ Requires-Dist: azure-core>=1.31.0
+ Requires-Dist: nltk>=3.9.1
+ Requires-Dist: azure-storage-blob>=12.19.0
+ Requires-Dist: httpx>=0.27.2
+ Requires-Dist: pandas<3.0.0,>=2.1.2; python_version < "3.13"
+ Requires-Dist: pandas<3.0.0,>=2.2.3; python_version == "3.13"
+ Requires-Dist: pandas<3.0.0,>=2.3.3; python_version >= "3.14"
+ Requires-Dist: openai>=1.108.0
+ Requires-Dist: ruamel.yaml<1.0.0,>=0.17.10
+ Requires-Dist: msrest>=0.6.21
+ Requires-Dist: Jinja2>=3.1.6
+ Requires-Dist: aiohttp>=3.0
+ Provides-Extra: redteam
+ Requires-Dist: pyrit==0.8.1; python_version >= "3.10" and extra == "redteam"
+ Requires-Dist: duckdb==1.3.2; python_version >= "3.10" and extra == "redteam"
+ Provides-Extra: opentelemetry
+ Requires-Dist: opentelemetry-sdk>=1.17.0; extra == "opentelemetry"
+ Requires-Dist: azure-monitor-opentelemetry-exporter>=1.0.0b17; extra == "opentelemetry"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: project-url
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary

  # Azure AI Evaluation client library for Python

@@ -55,8 +80,8 @@ Azure AI SDK provides following to evaluate Generative AI Applications:

  ### Prerequisites

- - Python 3.8 or later is required to use this package.
- - [Optional] You must have [Azure AI Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators
+ - Python 3.9 or later is required to use this package.
+ - [Optional] You must have [Azure AI Foundry Project][ai_project] or [Azure Open AI][azure_openai] to use AI-assisted evaluators

  ### Install the package

@@ -65,10 +90,6 @@ Install the Azure AI Evaluation SDK for Python with [pip][pip_link]:
  ```bash
  pip install azure-ai-evaluation
  ```
- If you want to track results in [AI Studio][ai_studio], install `remote` extra:
- ```python
- pip install azure-ai-evaluation[remote]
- ```

  ## Key concepts

@@ -113,13 +134,23 @@ result = relevance_evaluator(
  response="The capital of Japan is Tokyo."
  )

- # AI assisted safety evaluator
+ # There are two ways to provide Azure AI Project.
+ # Option #1 : Using Azure AI Project Details
  azure_ai_project = {
  "subscription_id": "<subscription_id>",
  "resource_group_name": "<resource_group_name>",
  "project_name": "<project_name>",
  }

+ violence_evaluator = ViolenceEvaluator(azure_ai_project)
+ result = violence_evaluator(
+ query="What is the capital of France?",
+ response="Paris."
+ )
+
+ # Option # 2 : Using Azure AI Project Url
+ azure_ai_project = "https://{resource_name}.services.ai.azure.com/api/projects/{project_name}"
+
  violence_evaluator = ViolenceEvaluator(azure_ai_project)
  result = violence_evaluator(
  query="What is the capital of France?",
@@ -177,9 +208,9 @@ result = evaluate(
  }
  }
  }
- # Optionally provide your AI Studio project information to track your evaluation results in your Azure AI Studio project
+ # Optionally provide your AI Foundry project information to track your evaluation results in your Azure AI Foundry project
  azure_ai_project = azure_ai_project,
- # Optionally provide an output path to dump a json of metric summary, row level data and metric and studio URL
+ # Optionally provide an output path to dump a json of metric summary, row level data and metric and AI Foundry URL
  output_path="./evaluation_results.json"
  )
  ```
@@ -270,11 +301,18 @@ with open("simulator_output.jsonl", "w") as f:
  ```python
  from azure.ai.evaluation.simulator import AdversarialSimulator, AdversarialScenario
  from azure.identity import DefaultAzureCredential
+
+ # There are two ways to provide Azure AI Project.
+ # Option #1 : Using Azure AI Project
  azure_ai_project = {
  "subscription_id": <subscription_id>,
  "resource_group_name": <resource_group_name>,
  "project_name": <project_name>
  }
+
+ # Option #2 : Using Azure AI Project Url
+ azure_ai_project = "https://{resource_name}.services.ai.azure.com/api/projects/{project_name}"
+
  scenario = AdversarialScenario.ADVERSARIAL_QA
  simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())

@@ -359,13 +397,13 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
  [evaluate_dataset]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#evaluate-on-test-dataset-using-evaluate
  [evaluators]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview
  [evaluate_api]: https://learn.microsoft.com/python/api/azure-ai-evaluation/azure.ai.evaluation?view=azure-python-preview#azure-ai-evaluation-evaluate
- [evaluate_app]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/evaluate_app
+ [evaluate_app]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Targets/Evaluate_App_Endpoint
  [evaluation_tsg]: https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/TROUBLESHOOTING.md
  [ai_studio]: https://learn.microsoft.com/azure/ai-studio/what-is-ai-studio
  [ai_project]: https://learn.microsoft.com/azure/ai-studio/how-to/create-projects?tabs=ai-studio
  [azure_openai]: https://learn.microsoft.com/azure/ai-services/openai/
- [evaluate_models]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/evaluate_endpoints
- [custom_evaluators]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/evaluate_custom
+ [evaluate_models]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Targets/Evaluate_Base_Model_Endpoint
+ [custom_evaluators]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Supported_Evaluation_Metrics/Custom_Evaluators
  [evaluate_samples]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate
  [evaluation_metrics]: https://learn.microsoft.com/azure/ai-studio/concepts/evaluation-metrics-built-in
  [performance_and_quality_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#performance-and-quality-evaluators
@@ -373,18 +411,318 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
  [composite_evaluators]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk#composite-evaluators
  [adversarial_simulation_docs]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#generate-adversarial-simulations-for-safety-evaluation
  [adversarial_simulation_scenarios]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#supported-adversarial-simulation-scenarios
- [adversarial_simulation]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/simulate_adversarial
- [simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/simulate_conversation_starter
+ [adversarial_simulation]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Adversarial_Data
+ [simulate_with_conversation_starter]: https://github.com/Azure-Samples/azureai-samples/tree/main/scenarios/evaluate/Simulators/Simulate_Context-Relevant_Data/Simulate_From_Conversation_Starter
  [adversarial_jailbreak]: https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data#simulating-jailbreak-attacks


  # Release History

+ ## 1.13.3 (2025-11-08)
+
+ ### Other Changes
+
+ - Added `scenario` property to red team evaluation request to align scores with red team concepts of attack success.
+
+ ## 1.13.2 (2025-11-07)
+
+ ### Bugs Fixed
+
+ - Added App Insights redaction for agent safety run telemetry so adversarial prompts are not stored in collected logs.
+
+ ## 1.13.1 (2025-11-05)
+
+ ### Features Added
+
+ - Improved RedTeam coverage across risk sub-categories to ensure comprehensive security testing
+ - Made RedTeam's `AttackStrategy.Tense` seed prompts dynamic to allow use of this strategy with additional risk categories
+ - Refactors error handling and result semantics in the RedTeam evaluation system to improve clarity and align with Attack Success Rate (ASR) conventions (passed=False means attack success)
+
+ ### Bugs Fixed
+
+ - Fixed RedTeam evaluation error related to context handling for context-dependent risk categories
+ - Fixed RedTeam prompt application for model targets during Indirect Jailbreak XPIA (Cross-Platform Indirect Attack)
+
+ ## 1.13.0 (2025-10-30)
+
+ ### Features Added
+
+ - Updated `IndirectAttack` risk category for RedTeam to `IndirectJailbreak` to better reflect its purpose. This change allows users to apply cross-domain prompt injection (XPIA) attack strategies across all risk categories, enabling more comprehensive security testing of AI systems against indirect prompt injection attacks during red teaming.
+ - Added `TaskAdherence`, `SensitiveDataLeakage`, and `ProhibitedActions` as cloud-only agent safety risk categories for red teaming.
+ - Updated all evaluators' output to be of the following schema:
+   - `gpt_{evaluator_name}`, `{evaluator_name}`: float score,
+   - `{evaluator_name}_result`: pass/fail based on threshold,
+   - `{evaluator_name}_reason`, `{evaluator_name}_threshold`
+   - `{evaluator_name}_prompt_tokens`, `{evaluator_name}_completion_tokens`, `{evaluator_name}_total_tokens`, `{evaluator_name}_finish_reason`
+   - `{evaluator_name}_model`: model used for evaluation
+   - `{evaluator_name}_sample_input`, `{evaluator_name}_sample_output`: input and output used for evaluation
+
+ This change standardizes the output format across all evaluators and follows OTel convention.
+
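Illustrative only: how a caller might read the standardized keys, using a hypothetical result dictionary whose key names are taken from the schema bullets above (the values are made up for the example):

```python
# Hypothetical output for an evaluator named "relevance", following the schema above.
result = {
    "relevance": 4.0,
    "gpt_relevance": 4.0,
    "relevance_result": "pass",
    "relevance_reason": "The response directly answers the question.",
    "relevance_threshold": 3,
}

name = "relevance"
score = result[name]                 # float score
verdict = result[f"{name}_result"]   # pass/fail against the threshold
print(f"{name}: {score} ({verdict}), threshold={result[f'{name}_threshold']}")
```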
+ ### Bugs Fixed
+
+ - `image_tag` parameter in `AzureOpenAIPythonGrader` is now optional.
+
+ ## 1.11.2 (2025-10-09)
+
+ ### Bugs Fixed
+
+ - `**kwargs` in an evaluator signature now receives input columns that are not otherwise named in the evaluator's signature
+
+ ## 1.12.0 (2025-10-02)
+
+ ### Features Added
+ - AOAI Graders now accept a "credential" parameter that can be used for authentication with an AzureOpenAIModelConfiguration
+ - Added `is_reasoning_model` parameter support to `CoherenceEvaluator`, `FluencyEvaluator`, `SimilarityEvaluator`, `GroundednessEvaluator`, `RetrievalEvaluator`, and `RelevanceEvaluator` to enable reasoning model configuration for o1/o3 models.
+
+ ### Bugs Fixed
+ - Support for multi-level nesting in OpenAI grader (experimental)
+
+ ## 1.11.1 (2025-09-19)
+
+ ### Bugs Fixed
+ - Pinning duckdb version to 1.3.2 for redteam extra to fix error `TypeError: unhashable type: '_duckdb.typing.DuckDBPyType'`
+
+ ## 1.11.0 (2025-09-03)
+
+ ### Features Added
+ - Added support for user-supplied tags in the `evaluate` function. Tags are key-value pairs that can be used for experiment tracking, A/B testing, filtering, and organizing evaluation runs. The function accepts a `tags` parameter (see the sketch after this list).
+ - Added support for user-supplied TokenCredentials with LLM based evaluators.
+ - Enhanced `GroundednessEvaluator` to support AI agent evaluation with tool calls. The evaluator now accepts agent response data containing tool calls and can extract context from `file_search` tool results for groundedness assessment. This enables evaluation of AI agents that use tools to retrieve information and generate responses. Note: Agent groundedness evaluation is currently supported only when the `file_search` tool is used.
+ - Added `language` parameter to `RedTeam` class for multilingual red team scanning support. The parameter accepts values from `SupportedLanguages` enum including English, Spanish, French, German, Italian, Portuguese, Japanese, Korean, and Simplified Chinese, enabling red team attacks to be generated and conducted in multiple languages.
+ - Added support for IndirectAttack and UngroundedAttributes risk categories in `RedTeam` scanning. These new risk categories expand red team capabilities to detect cross-platform indirect attacks and evaluate ungrounded inferences about human attributes including emotional state and protected class information.
+
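A minimal sketch of the `tags` keyword described in the list above; the data file, evaluator choice, and tag values are placeholders:

```python
from azure.ai.evaluation import evaluate, F1ScoreEvaluator

result = evaluate(
    # eval_data.jsonl is assumed to contain "response" and "ground_truth" columns.
    data="eval_data.jsonl",
    evaluators={"f1": F1ScoreEvaluator()},
    # Free-form key-value pairs for experiment tracking, A/B testing, and filtering runs.
    tags={"experiment": "baseline", "model": "gpt-4o-mini"},
)
```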
495
+ ### Bugs Fixed
496
+ - Fixed issue where evaluation results were not properly aligned with input data, leading to incorrect metrics being reported.
497
+
498
+ ### Other Changes
499
+ - Deprecating `AdversarialSimulator` in favor of the [AI Red Teaming Agent](https://aka.ms/airedteamingagent-sample). `AdversarialSimulator` will be removed in the next minor release.
500
+ - Moved retry configuration constants (`MAX_RETRY_ATTEMPTS`, `MAX_RETRY_WAIT_SECONDS`, `MIN_RETRY_WAIT_SECONDS`) from `RedTeam` class to new `RetryManager` class for better code organization and configurability.
501
+
502
+ ## 1.10.0 (2025-07-31)
503
+
504
+ ### Breaking Changes
505
+
506
+ - Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
507
+
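A sketch of the new flag; the changelog only says it "can be passed as a keyword argument", so passing it at construction time, as below, is an assumption, and the project URL and credential are placeholders:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

violence = ViolenceEvaluator(
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",
    credential=DefaultAzureCredential(),
    evaluate_query=True,  # evaluate both query and response; the default (False) evaluates only the response
)
result = violence(query="What is the capital of France?", response="Paris.")
```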
508
+ ### Features Added
509
+
510
+ - Added support for Azure OpenAI Python grader via `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure Open AI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
511
+ - Added `attack_success_thresholds` parameter to `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e. higher threshold means higher
512
+ tolerance for harmful responses).
513
+ - Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
514
+
515
+
516
+ ### Bugs Fixed
517
+
518
+ - Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
519
+ - Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
520
+ - Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
521
+
522
+
523
+ ### Other Changes
524
+
525
+ - The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
526
+ - Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
527
+ This is due to be removed in a future release.
528
+
529
+
530
+ ## 1.9.0 (2025-07-02)
531
+
532
+ ### Features Added
533
+
534
+ - Added support for Azure Open AI evaluation via `AzureOpenAIScoreModelGrader` class, which serves as a wrapper around Azure Open AI score model configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
535
+ - Added new experimental risk categories ProtectedMaterial and CodeVulnerability for redteam agent scan.
536
+
537
+
538
+ ### Bugs Fixed
539
+
540
+ - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
541
+
542
+ - Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance. and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
543
+ - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
544
+ - Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
545
+ - `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
546
+
547
+ ## 1.8.0 (2025-05-29)
548
+
549
+ ### Features Added
550
+
551
+ - Introduces `AttackStrategy.MultiTurn` and `AttackStrategy.Crescendo` to `RedTeam`. These strategies attack the target of a `RedTeam` scan over the course of multi-turn conversations.
552
+
553
+ ### Bugs Fixed
554
+ - AdversarialSimulator in `ADVERSARIAL_CONVERSATION` mode was broken. It is now fixed.
555
+
556
+ ## 1.7.0 (2025-05-12)
557
+
558
+ ### Bugs Fixed
559
+ - azure-ai-evaluation failed with module not found [#40992](https://github.com/Azure/azure-sdk-for-python/issues/40992)
560
+
561
+ ## 1.6.0 (2025-05-07)
562
+
563
+ ### Features Added
564
+ - New `<evaluator>.binary_aggregate` field added to evaluation result metrics. This field contains the aggregated binary evaluation results for each evaluator, providing a summary of the evaluation outcomes.
565
+ - Added support for Azure Open AI evaluation via 4 new 'grader' classes, which serve as wrappers around Azure Open AI grader configurations. These new grader objects can be supplied to the main `evaluate` method as if they were normal callable evaluators. The new classes are:
566
+ - AzureOpenAIGrader (general class for experienced users)
567
+ - AzureOpenAILabelGrader
568
+ - AzureOpenAIStringCheckGrader
569
+ - AzureOpenAITextSimilarityGrader
570
+
571
+ ### Breaking Changes
572
+ - In the experimental RedTeam's scan method, the `data_only` param has been replaced with `skip_evals` and if you do not want data to be uploaded, use the `skip_upload` flag.
573
+
574
+ ### Bugs Fixed
575
+ - Fixed error in `evaluate` where data fields could not contain numeric characters. Previously, a data file with schema:
576
+ ```
577
+ "query1": "some query", "response": "some response"
578
+ ```
579
+ throws error when passed into `evaluator_config` as `{"evaluator_name": {"column_mapping": {"query": "${data.query1}", "response": "${data.response}"}},}`.
580
+ Now, users may import data containing fields with numeric characters.
581
+
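For context, a sketch of the `column_mapping` syntax the fix refers to, reusing the `${data.query1}` reference from the bullet above; the file name, model configuration, and evaluator are placeholders:

```python
from azure.ai.evaluation import evaluate, RelevanceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

result = evaluate(
    # Rows shaped like {"query1": "some query", "response": "some response"}.
    data="data_with_numeric_columns.jsonl",
    evaluators={"relevance": RelevanceEvaluator(model_config)},
    evaluator_config={
        "relevance": {
            "column_mapping": {
                "query": "${data.query1}",  # column names containing digits are now accepted
                "response": "${data.response}",
            }
        }
    },
)
```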
+
+ ## 1.5.0 (2025-04-04)
+
+ ### Features Added
+
+ - New `RedTeam` agent functionality to assess the safety and resilience of AI systems against adversarial prompt attacks
+
+ ## 1.4.0 (2025-03-27)
+
+ ### Features Added
+ - Enhanced binary evaluation results with customizable thresholds
+   - Added threshold support for QA and ContentSafety evaluators
+   - Evaluation results now include both the score and threshold values
+   - Configurable threshold parameter allows custom binary classification boundaries
+   - Default thresholds provided for backward compatibility
+   - Quality evaluators use "higher is better" scoring (score ≥ threshold is positive)
+   - Content safety evaluators use "lower is better" scoring (score ≤ threshold is positive); see the sketch after this list
+ - New built-in evaluator called CodeVulnerabilityEvaluator is added.
+   - It provides capabilities to identify the following code vulnerabilities:
+     - path-injection
+     - sql-injection
+     - code-injection
+     - stack-trace-exposure
+     - incomplete-url-substring-sanitization
+     - flask-debug
+     - clear-text-logging-sensitive-data
+     - incomplete-hostname-regexp
+     - server-side-unvalidated-url-redirection
+     - weak-cryptographic-algorithm
+     - full-ssrf
+     - bind-socket-all-network-interfaces
+     - client-side-unvalidated-url-redirection
+     - likely-bugs
+     - reflected-xss
+     - clear-text-storage-sensitive-data
+     - tarslip
+     - hardcoded-credentials
+     - insecure-randomness
+   - It also supports multiple coding languages, such as Python, Java, C++, C#, Go, JavaScript, and SQL.
+
+ - New built-in evaluator called UngroundedAttributesEvaluator is added.
+   - It evaluates ungrounded inference of human attributes for a given query, response, and context for a single-turn evaluation only, where query represents the user query and response represents the AI system response given the provided context.
+   - Ungrounded Attributes first checks whether a response is ungrounded, and then checks whether it contains information about the protected class or emotional state of a person.
+   - It identifies the following attributes:
+     - emotional_state
+     - protected_class
+     - groundedness
+ - New built-in evaluators for Agent Evaluation (Preview)
+   - IntentResolutionEvaluator - Evaluates the intent resolution of an agent's response to a user query.
+   - ResponseCompletenessEvaluator - Evaluates the response completeness of an agent's response to a user query.
+   - TaskAdherenceEvaluator - Evaluates the task adherence of an agent's response to a user query.
+   - ToolCallAccuracyEvaluator - Evaluates the accuracy of tool calls made by an agent in response to a user query.
+
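The two scoring directions above reduce to a single comparison. A standalone sketch of that pass/fail rule (plain Python, not SDK code):

```python
def binary_result(score: float, threshold: float, higher_is_better: bool) -> str:
    # Quality evaluators: score >= threshold passes; content safety evaluators: score <= threshold passes.
    passed = score >= threshold if higher_is_better else score <= threshold
    return "pass" if passed else "fail"

print(binary_result(4.0, 3.0, higher_is_better=True))   # quality score above threshold -> pass
print(binary_result(5.0, 3.0, higher_is_better=False))  # harm severity above threshold -> fail
```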
+ ### Bugs Fixed
+ - Fixed error in `GroundednessProEvaluator` when handling non-numeric values like "n/a" returned from the service.
+ - Uploading local evaluation results from `evaluate` with the same run name will no longer result in each online run sharing (and bashing) result files.
+
+ ## 1.3.0 (2025-02-28)
+
+ ### Breaking Changes
+ - Multimodal specific evaluators `ContentSafetyMultimodalEvaluator`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` have been removed. Please use `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator` instead.
+ - Metric name in ProtectedMaterialEvaluator's output is changed from `protected_material.fictional_characters_label` to `protected_material.fictional_characters_defect_rate`. It's now consistent with other evaluators' metric names (ending with `_defect_rate`).
+
+ ## 1.2.0 (2025-01-27)
+
+ ### Features Added
+ - CSV files are now supported as data file inputs with the `evaluate()` API. The CSV file should have a header row with column names that match the `data` and `target` fields in the `evaluate()` method, and the filename should be passed as the `data` parameter. Column name 'Conversation' in CSV file is not fully supported yet.
+
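A sketch of the CSV input described above; the file name and evaluator are placeholders, and the CSV is assumed to have a header row with columns matching the evaluator's inputs:

```python
from azure.ai.evaluation import evaluate, F1ScoreEvaluator

# eval_data.csv is assumed to have "response" and "ground_truth" header columns.
result = evaluate(
    data="eval_data.csv",  # CSV is accepted in addition to JSONL
    evaluators={"f1": F1ScoreEvaluator()},
)
```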
+ ### Breaking Changes
+ - `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be removed in the next release.
+
+ ### Bugs Fixed
+ - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Studio.
+ - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
+ - Fixed the non-adversarial simulator to run in task-free mode
+ - Content safety evaluators (violence, self harm, sexual, hate/unfairness) return the maximum result as the
+ main score when aggregating per-turn evaluations from a conversation into an overall
+ evaluation score. Other conversation-capable evaluators still default to a mean for aggregation.
+ - Fixed bug in non-adversarial simulator sample where `tasks` was undefined
+
+ ### Other Changes
+ - Changed minimum required Python version to use this package from 3.8 to 3.9
+ - Stopped depending on the local promptflow service. No promptflow service will automatically start when running evaluation.
+ - Evaluators internally allow for custom aggregation. However, this causes serialization failures if evaluated while the
+ environment variable `AI_EVALS_BATCH_USE_ASYNC` is set to false.
+
+ ## 1.1.0 (2024-12-12)
+
+ ### Features Added
+ - Added image support in `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` and `ProtectedMaterialEvaluator`. Provide image URLs or base64 encoded images in `conversation` input for image evaluation. See below for an example:
+
+ ```python
+ evaluator = ContentSafetyEvaluator(credential=azure_cred, azure_ai_project=project_scope)
+ conversation = {
+ "messages": [
+ {
+ "role": "system",
+ "content": [
+ {"type": "text", "text": "You are an AI assistant that understands images."}
+ ],
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Can you describe this image?"},
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"
+ },
+ },
+ ],
+ },
+ {
+ "role": "assistant",
+ "content": [
+ {
+ "type": "text",
+ "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.",
+ }
+ ],
+ },
+ ]
+ }
+ print("Calling Content Safety Evaluator for multi-modal")
+ score = evaluator(conversation=conversation)
+ ```
+
+ - Please switch to generic evaluators for image evaluations as mentioned above. `ContentSafetyMultimodalEvaluator`, `ContentSafetyMultimodalEvaluatorBase`, `ViolenceMultimodalEvaluator`, `SexualMultimodalEvaluator`, `SelfHarmMultimodalEvaluator`, `HateUnfairnessMultimodalEvaluator` and `ProtectedMaterialMultimodalEvaluator` will be deprecated in the next release.
+
+ ### Bugs Fixed
+ - Removed `[remote]` extra. This is no longer needed when tracking results in Azure AI Foundry portal.
+ - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results
+
  ## 1.0.1 (2024-11-15)

  ### Bugs Fixed
- - Fixed `[remote]` extra to be needed only when tracking results in Azure AI Studio.
  - Removing `azure-ai-inference` as dependency.
+ - Fixed `AttributeError: 'NoneType' object has no attribute 'get'` while running simulator with 1000+ results

  ## 1.0.0 (2024-11-13)

@@ -396,6 +734,7 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
  - Fixed an issue where the `output_path` parameter in the `evaluate` API did not support relative path.
  - Output of adversarial simulators are of type `JsonLineList` and the helper function `to_eval_qr_json_lines` now outputs context from both user and assistant turns along with `category` if it exists in the conversation
  - Fixed an issue where during long-running simulations, API token expires causing "Forbidden" error. Instead, users can now set an environment variable `AZURE_TOKEN_REFRESH_INTERVAL` to refresh the token more frequently to prevent expiration and ensure continuous operation of the simulation.
+ - Fixed an issue with the `ContentSafetyEvaluator` that caused parallel execution of sub-evaluators to fail. Parallel execution is now enabled by default again, but can still be disabled via the '_parallel' boolean keyword argument during class initialization.
  - Fix `evaluate` function not producing aggregated metrics if ANY values to be aggregated were None, NaN, or
  otherwise difficult to process. Such values are ignored fully, so the aggregated metric of `[1, 2, 3, NaN]`
  would be 2, not 1.5.