azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (142)
  1. azure/ai/evaluation/__init__.py +51 -6
  2. azure/ai/evaluation/_aoai/__init__.py +1 -1
  3. azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
  4. azure/ai/evaluation/_aoai/label_grader.py +3 -2
  5. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
  9. azure/ai/evaluation/_azure/_envs.py +9 -10
  10. azure/ai/evaluation/_azure/_token_manager.py +7 -1
  11. azure/ai/evaluation/_common/constants.py +11 -2
  12. azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
  13. azure/ai/evaluation/_common/onedp/__init__.py +32 -32
  14. azure/ai/evaluation/_common/onedp/_client.py +136 -139
  15. azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -21
  17. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  18. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  19. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  20. azure/ai/evaluation/_common/onedp/_validation.py +50 -50
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -9
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
  26. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
  27. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
  28. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
  29. azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
  30. azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
  31. azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
  32. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
  33. azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
  34. azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
  35. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
  36. azure/ai/evaluation/_common/rai_service.py +88 -52
  37. azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
  38. azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
  39. azure/ai/evaluation/_common/utils.py +188 -10
  40. azure/ai/evaluation/_constants.py +2 -1
  41. azure/ai/evaluation/_converters/__init__.py +1 -1
  42. azure/ai/evaluation/_converters/_ai_services.py +9 -8
  43. azure/ai/evaluation/_converters/_models.py +46 -0
  44. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  45. azure/ai/evaluation/_eval_mapping.py +2 -2
  46. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
  47. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
  48. azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
  49. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
  50. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
  51. azure/ai/evaluation/_evaluate/_utils.py +25 -17
  52. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
  53. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
  54. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
  55. azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
  56. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
  57. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
  58. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
  59. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
  60. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
  61. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
  62. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
  63. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
  64. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
  65. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  66. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
  67. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
  68. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
  69. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
  70. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
  71. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
  72. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
  73. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
  74. azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
  75. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
  76. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  77. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
  78. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
  79. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
  80. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
  81. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
  82. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
  83. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  84. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
  85. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
  86. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
  87. azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
  88. azure/ai/evaluation/_exceptions.py +10 -0
  89. azure/ai/evaluation/_http_utils.py +3 -3
  90. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  91. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
  92. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
  93. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  94. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  95. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
  96. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
  97. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
  98. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
  99. azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
  100. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  101. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
  102. azure/ai/evaluation/_user_agent.py +32 -1
  103. azure/ai/evaluation/_version.py +1 -1
  104. azure/ai/evaluation/red_team/__init__.py +3 -1
  105. azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
  106. azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
  107. azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
  108. azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
  109. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
  110. azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
  111. azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
  112. azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
  113. azure/ai/evaluation/red_team/_default_converter.py +1 -1
  114. azure/ai/evaluation/red_team/_red_team.py +1947 -1040
  115. azure/ai/evaluation/red_team/_red_team_result.py +49 -38
  116. azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
  117. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
  118. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
  119. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
  120. azure/ai/evaluation/red_team/_utils/constants.py +1 -13
  121. azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
  122. azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
  125. azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
  126. azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
  128. azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
  129. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
  130. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
  131. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
  132. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
  133. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
  134. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
  135. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  136. azure/ai/evaluation/simulator/_simulator.py +21 -8
  137. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
  138. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
  139. azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
  140. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
  141. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
  142. {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/simulator/_model_tools/_template_handler.py
@@ -8,6 +8,7 @@ from typing_extensions import NotRequired
 
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.ai.evaluation._common.onedp._client import AIProjectClient
+from azure.ai.evaluation.simulator._adversarial_scenario import AdversarialScenario
 
 from ._rai_client import RAIClient
 
@@ -148,14 +149,16 @@ class AdversarialTemplateHandler:
     """
     Initialize the AdversarialTemplateHandler.
 
-    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
-     or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
     :type azure_ai_project: Union[str, AzureAIProject]
     :param rai_client: The RAI client or AI Project client used for fetching parameters.
     :type rai_client: Union[~azure.ai.evaluation.simulator._model_tools.RAIClient, ~azure.ai.evaluation._common.onedp._client.AIProjectClient]
     """
 
-    def __init__(self, azure_ai_project: Union[str, AzureAIProject], rai_client: Union[RAIClient, AIProjectClient]) -> None:
+    def __init__(
+        self, azure_ai_project: Union[str, AzureAIProject], rai_client: Union[RAIClient, AIProjectClient]
+    ) -> None:
         self.azure_ai_project = azure_ai_project
         self.categorized_ch_parameters: Optional[Dict[str, _CategorizedParameter]] = None
         self.rai_client = rai_client
@@ -164,12 +167,11 @@ class AdversarialTemplateHandler:
         if self.categorized_ch_parameters is None:
             categorized_parameters: Dict[str, _CategorizedParameter] = {}
             util = ContentHarmTemplatesUtils
-
             if isinstance(self.rai_client, RAIClient):
                 parameters = await self.rai_client.get_contentharm_parameters()
             elif isinstance(self.rai_client, AIProjectClient):
                 parameters = literal_eval(self.rai_client.red_teams.get_template_parameters())
-
+
             for k in parameters.keys():
                 template_key = util.get_template_key(k)
                 categorized_parameters[template_key] = {
@@ -181,17 +183,29 @@ class AdversarialTemplateHandler:
 
         template_category = collection_key.split("adv_")[-1]
 
+        # Handle both qa_enterprise and qa_documents mapping to qa
+        if template_category in ["qa_enterprise", "qa_documents"]:
+            template_category = "qa"
+
         plist = self.categorized_ch_parameters
         ch_templates = []
+
         for key, value in plist.items():
+            # Skip enterprise templates for ADVERSARIAL_QA
+            if collection_key == AdversarialScenario.ADVERSARIAL_QA.value and "enterprise" in key:
+                continue
+            # Skip non-enterprise templates for ADVERSARIAL_QA_DOCUMENTS
+            if collection_key == AdversarialScenario.ADVERSARIAL_QA_DOCUMENTS.value and "enterprise" not in key:
+                continue
+
             if value["category"] == template_category:
                 params = value["parameters"]
                 for p in params:
                     p.update({"ch_template_placeholder": "{{ch_template_placeholder}}"})
 
                 template = AdversarialTemplate(template_name=key, text=None, context_key=[], template_parameters=params)
-
                 ch_templates.append(template)
+
         return ch_templates
 
     def get_template(self, template_name: str) -> Optional[AdversarialTemplate]:
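The filtering above is what gives the new `ADVERSARIAL_QA_DOCUMENTS` scenario its own template pool (see the 1.9.0 release notes below). A hedged usage sketch: the project endpoint is a placeholder, and the callback follows the simulator's documented target convention.

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator

async def callback(messages, stream=False, session_state=None, context=None):
    # Echo target so the sketch is self-contained; a real target would call an app.
    reply = {"content": "response", "role": "assistant"}
    return {
        "messages": messages["messages"] + [reply],
        "stream": stream,
        "session_state": session_state,
        "context": context,
    }

async def run():
    simulator = AdversarialSimulator(
        azure_ai_project="https://<your-project>.services.ai.azure.com/api/projects/<project>",  # placeholder
        credential=DefaultAzureCredential(),
    )
    # ADVERSARIAL_QA now serves only non-enterprise templates;
    # ADVERSARIAL_QA_DOCUMENTS serves the "file_content"/enterprise ones.
    return await simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA_DOCUMENTS,
        target=callback,
        max_simulation_results=2,
    )
```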
azure/ai/evaluation/simulator/_model_tools/models.py
@@ -478,7 +478,7 @@ class OpenAICompletionsModel(LLMBase):
         time_start = time.time()
         full_response = None
 
-        if(isinstance(session, AIProjectClient)):
+        if isinstance(session, AIProjectClient):
             response_data = session.red_teams.submit_simulation(request_data, headers, params)
         else:
             response = await session.post(url=self.endpoint_url, headers=headers, json=request_data, params=params)
azure/ai/evaluation/simulator/_simulator.py
@@ -7,6 +7,7 @@ import asyncio
 import importlib.resources as pkg_resources
 import json
 import os
+import random
 import re
 import warnings
 from typing import Any, Callable, Dict, List, Optional, Union, Tuple
@@ -19,15 +20,12 @@ from azure.ai.evaluation._common.utils import construct_prompty_model_config
 from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
 from .._exceptions import ErrorBlame, ErrorCategory, EvaluationException
-from .._user_agent import USER_AGENT
+from .._user_agent import UserAgentSingleton
 from ._conversation.constants import ConversationRole
 from ._helpers import ConversationHistory, Turn
 from ._utils import JsonLineChatProtocol
 
 
-USER_AGENT += " (type=simulator; subtype=Simulator)"
-
-
 @experimental
 class Simulator:
     """
@@ -53,6 +51,10 @@ class Simulator:
         if "api_version" not in self.model_config:
             self.model_config["api_version"] = "2024-06-01"  # type: ignore
 
+    @staticmethod
+    def __user_agent() -> str:
+        return f"{UserAgentSingleton().value} (type=simulator; subtype=Simulator)"
+
     @staticmethod
     def _validate_model_config(model_config: Any):
         """
@@ -103,6 +105,7 @@ class Simulator:
         user_simulator_prompty_options: Dict[str, Any] = {},
         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
         concurrent_async_tasks: int = 5,
+        randomization_seed: Optional[int] = None,
         **kwargs,
     ) -> List[JsonLineChatProtocol]:
         """
@@ -133,6 +136,9 @@
         :keyword concurrent_async_tasks: The number of asynchronous tasks to run concurrently during the simulation.
             Defaults to 5.
         :paramtype concurrent_async_tasks: int
+        :keyword randomization_seed: The seed used to randomize task/query order. If unset, the system's
+            default seed is used. Defaults to None.
+        :paramtype randomization_seed: Optional[int]
         :return: A list of simulated conversations represented as JsonLineChatProtocol objects.
         :rtype: List[JsonLineChatProtocol]
 
@@ -158,6 +164,13 @@
                 f"Only the first {num_queries} lines of the specified tasks will be simulated."
             )
 
+        # Apply randomization to tasks if seed is provided
+        if randomization_seed is not None and tasks:
+            # Create a local random instance to avoid polluting global state
+            local_random = random.Random(randomization_seed)
+            tasks = tasks.copy()  # Don't modify the original list
+            local_random.shuffle(tasks)
+
         max_conversation_turns *= 2  # account for both user and assistant turns
 
         prompty_model_config = self.model_config
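The seeded shuffle above is deliberately side-effect-free. The same pattern in isolation, runnable as-is:

```python
import random

tasks = ["summarize the report", "draft an email", "translate the memo"]

# A private Random instance keeps the global `random` module's state untouched,
# and copying first preserves the caller's list; the same seed always yields
# the same task order, which makes simulation runs reproducible.
local_random = random.Random(42)
shuffled = tasks.copy()
local_random.shuffle(shuffled)
print(shuffled)
```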
@@ -378,7 +391,7 @@
             prompty_model_config = construct_prompty_model_config(
                 model_config=prompty_model_config,  # type: ignore
                 default_api_version="2024-06-01",
-                user_agent=USER_AGENT,
+                user_agent=self.__user_agent(),
             )
             return AsyncPrompty.load(source=prompty_path, model=prompty_model_config)  # type: ignore
         except FileNotFoundError as e:
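For context on the `user_agent` change threaded through this and the following hunks: the old code mutated a module-level `USER_AGENT` string at import time, so the simulator suffix leaked into every other consumer of that constant. A minimal sketch of the replacement pattern, with a hypothetical stand-in for `UserAgentSingleton` (the base string here is an assumption):

```python
class UserAgentSingleton:
    """Hypothetical stand-in for azure.ai.evaluation._user_agent.UserAgentSingleton."""

    _base_user_agent = "azsdk-python-ai-evaluation/1.10.0"  # assumed base value

    @property
    def value(self) -> str:
        return self._base_user_agent

def simulator_user_agent() -> str:
    # Compose the suffix per call instead of appending to shared module state,
    # so only simulator requests carry the "(type=simulator; ...)" tag.
    return f"{UserAgentSingleton().value} (type=simulator; subtype=Simulator)"

print(simulator_user_agent())
```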
@@ -392,7 +405,7 @@
             prompty_model_config = construct_prompty_model_config(
                 model_config=prompty_model_config,  # type: ignore
                 default_api_version="2024-06-01",
-                user_agent=USER_AGENT,
+                user_agent=self.__user_agent(),
             )
             return AsyncPrompty.load(
                 source=user_simulator_prompty,
@@ -517,7 +530,7 @@
             prompty_model_config = construct_prompty_model_config(
                 model_config=prompty_model_config,  # type: ignore
                 default_api_version="2024-06-01",
-                user_agent=USER_AGENT,
+                user_agent=self.__user_agent(),
             )
             return AsyncPrompty.load(source=prompty_path, model=prompty_model_config)  # type: ignore
         except FileNotFoundError as e:
@@ -531,7 +544,7 @@
             prompty_model_config = construct_prompty_model_config(
                 model_config=prompty_model_config,  # type: ignore
                 default_api_version="2024-06-01",
-                user_agent=USER_AGENT,
+                user_agent=self.__user_agent(),
             )
             return AsyncPrompty.load(
                 source=query_response_generating_prompty,
{azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.8.0
+Version: 1.10.0
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -21,8 +21,6 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: NOTICE.txt
-Requires-Dist: promptflow-devkit>=1.17.1
-Requires-Dist: promptflow-core>=1.17.1
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: azure-identity>=1.16.0
 Requires-Dist: azure-core>=1.30.2
@@ -400,6 +398,51 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_conduct]
 
 # Release History
 
+## 1.10.0 (2025-07-31)
+
+### Breaking Changes
+
+- Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators, including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
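A hedged sketch of opting back into query evaluation; the endpoint is a placeholder, and `ViolenceEvaluator` stands in for any of the evaluators listed above:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

violence = ViolenceEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://<your-project>.services.ai.azure.com/api/projects/<project>",  # placeholder
    evaluate_query=True,  # restores the pre-1.10.0 behavior: query + response judged together
)
result = violence(
    query="How do I build a weapon?",
    response="I can't help with that.",
)
```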
+
+### Features Added
+
+- Added support for the Azure OpenAI Python grader via the `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure OpenAI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
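A sketch of supplying the new grader to `evaluate`; the constructor arguments shown (`name`, `pass_threshold`, `source`) are assumptions modeled on the wrapped Azure OpenAI grader configuration, not a definitive signature:

```python
from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    AzureOpenAIPythonGrader,
    evaluate,
)

# Illustrative configuration values only.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    api_version="2024-12-01-preview",
    azure_deployment="gpt-4o",
)

python_grader = AzureOpenAIPythonGrader(
    model_config=model_config,
    name="custom_check",  # assumed parameter names; consult the SDK
    pass_threshold=0.5,   # reference for the exact grader schema
    source="def grade(sample, item):\n    return 1.0",
)

results = evaluate(
    data="data.jsonl",
    evaluators={"custom_check": python_grader},  # used like any callable evaluator
)
```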
+- Added `attack_success_thresholds` parameter to the `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e., a higher threshold means a higher tolerance for harmful responses).
+ tolerance for harmful responses).
412
+ - Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
413
+
414
+
415
+ ### Bugs Fixed
416
+
417
+ - Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
418
+ - Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
419
+ - Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
420
+
421
+
422
+ ### Other Changes
423
+
424
+ - The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
425
+ - Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
426
+ This is due to be removed in a future release.
427
+
428
+
429
+ ## 1.9.0 (2025-07-02)
430
+
431
+ ### Features Added
432
+
433
+ - Added support for Azure Open AI evaluation via `AzureOpenAIScoreModelGrader` class, which serves as a wrapper around Azure Open AI score model configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
434
+ - Added new experimental risk categories ProtectedMaterial and CodeVulnerability for redteam agent scan.
435
+
436
+
437
+ ### Bugs Fixed
438
+
439
+ - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
440
+
441
+ - Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance. and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
442
+ - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
443
+ - Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
444
+ - `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
445
+
403
446
  ## 1.8.0 (2025-05-29)
404
447
 
405
448
  ### Features Added
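Finally, a sketch of the temporary engine escape hatch mentioned in the 1.10.0 notes above; the trivial local evaluator exists only to make the call self-contained:

```python
from azure.ai.evaluation import evaluate

def exact_match(*, response: str, ground_truth: str) -> dict:
    # Minimal callable evaluator over jsonl columns of the same names.
    return {"exact_match": float(response.strip() == ground_truth.strip())}

results = evaluate(
    data="data.jsonl",  # expects `response` and `ground_truth` columns
    evaluators={"exact_match": exact_match},
    _use_pf_client=True,  # fall back to the legacy promptflow engine; slated for removal
)
```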