azure-ai-evaluation 1.8.0__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +51 -6
- azure/ai/evaluation/_aoai/__init__.py +1 -1
- azure/ai/evaluation/_aoai/aoai_grader.py +21 -11
- azure/ai/evaluation/_aoai/label_grader.py +3 -2
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +91 -0
- azure/ai/evaluation/_aoai/string_check_grader.py +3 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +3 -2
- azure/ai/evaluation/_azure/_envs.py +9 -10
- azure/ai/evaluation/_azure/_token_manager.py +7 -1
- azure/ai/evaluation/_common/constants.py +11 -2
- azure/ai/evaluation/_common/evaluation_onedp_client.py +32 -26
- azure/ai/evaluation/_common/onedp/__init__.py +32 -32
- azure/ai/evaluation/_common/onedp/_client.py +136 -139
- azure/ai/evaluation/_common/onedp/_configuration.py +70 -73
- azure/ai/evaluation/_common/onedp/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
- azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
- azure/ai/evaluation/_common/onedp/_validation.py +50 -50
- azure/ai/evaluation/_common/onedp/_version.py +9 -9
- azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -29
- azure/ai/evaluation/_common/onedp/aio/_client.py +138 -143
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +70 -75
- azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4832 -4494
- azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/models/__init__.py +168 -142
- azure/ai/evaluation/_common/onedp/models/_enums.py +230 -162
- azure/ai/evaluation/_common/onedp/models/_models.py +2685 -2228
- azure/ai/evaluation/_common/onedp/models/_patch.py +21 -21
- azure/ai/evaluation/_common/onedp/operations/__init__.py +37 -39
- azure/ai/evaluation/_common/onedp/operations/_operations.py +6106 -5657
- azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -21
- azure/ai/evaluation/_common/rai_service.py +88 -52
- azure/ai/evaluation/_common/raiclient/__init__.py +1 -1
- azure/ai/evaluation/_common/raiclient/operations/_operations.py +14 -1
- azure/ai/evaluation/_common/utils.py +188 -10
- azure/ai/evaluation/_constants.py +2 -1
- azure/ai/evaluation/_converters/__init__.py +1 -1
- azure/ai/evaluation/_converters/_ai_services.py +9 -8
- azure/ai/evaluation/_converters/_models.py +46 -0
- azure/ai/evaluation/_converters/_sk_services.py +495 -0
- azure/ai/evaluation/_eval_mapping.py +2 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +73 -25
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +2 -2
- azure/ai/evaluation/_evaluate/_evaluate.py +210 -94
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +132 -89
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +0 -1
- azure/ai/evaluation/_evaluate/_utils.py +25 -17
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +4 -4
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +20 -12
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +6 -6
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +45 -11
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +24 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +28 -18
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +11 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +12 -9
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -7
- azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +1 -5
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +37 -64
- azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +5 -5
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -3
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +4 -4
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +12 -8
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +31 -26
- azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +210 -96
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -4
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +14 -7
- azure/ai/evaluation/_evaluators/_qa/_qa.py +5 -5
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +62 -15
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +21 -26
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +5 -5
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +22 -22
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +7 -6
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +4 -4
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +27 -24
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +175 -183
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +99 -21
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +20 -12
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +10 -7
- azure/ai/evaluation/_exceptions.py +10 -0
- azure/ai/evaluation/_http_utils.py +3 -3
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +117 -32
- azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +5 -2
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +33 -41
- azure/ai/evaluation/_legacy/_batch_engine/_utils.py +1 -4
- azure/ai/evaluation/_legacy/_common/_async_token_provider.py +12 -19
- azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +2 -0
- azure/ai/evaluation/_legacy/prompty/_prompty.py +11 -5
- azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +195 -111
- azure/ai/evaluation/_user_agent.py +32 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +3 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +1 -1
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +68 -71
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +103 -145
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +26 -6
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +62 -71
- azure/ai/evaluation/red_team/_attack_objective_generator.py +94 -52
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -1
- azure/ai/evaluation/red_team/_callback_chat_target.py +4 -9
- azure/ai/evaluation/red_team/_default_converter.py +1 -1
- azure/ai/evaluation/red_team/_red_team.py +1947 -1040
- azure/ai/evaluation/red_team/_red_team_result.py +49 -38
- azure/ai/evaluation/red_team/_utils/__init__.py +1 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +39 -34
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +163 -138
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +14 -14
- azure/ai/evaluation/red_team/_utils/constants.py +1 -13
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +41 -44
- azure/ai/evaluation/red_team/_utils/logging_utils.py +17 -17
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +31 -4
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +33 -25
- azure/ai/evaluation/simulator/_adversarial_scenario.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +31 -17
- azure/ai/evaluation/simulator/_conversation/__init__.py +2 -2
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +8 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +18 -6
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +54 -24
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +7 -1
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +30 -10
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +19 -31
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +20 -6
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- azure/ai/evaluation/simulator/_simulator.py +21 -8
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA +46 -3
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/RECORD +141 -136
- azure/ai/evaluation/_common/onedp/aio/_vendor.py +0 -40
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/top_level.txt +0 -0
From azure/ai/evaluation/simulator/_model_tools/_template_handler.py (+20 -6):

```diff
@@ -8,6 +8,7 @@ from typing_extensions import NotRequired
 
 from azure.ai.evaluation._model_configurations import AzureAIProject
 from azure.ai.evaluation._common.onedp._client import AIProjectClient
+from azure.ai.evaluation.simulator._adversarial_scenario import AdversarialScenario
 
 from ._rai_client import RAIClient
 
@@ -148,14 +149,16 @@ class AdversarialTemplateHandler:
     """
     Initialize the AdversarialTemplateHandler.
 
-    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
-        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
+    :param azure_ai_project: The Azure AI project, which can either be a string representing the project endpoint
+        or an instance of AzureAIProject. It contains subscription id, resource group, and project name.
     :type azure_ai_project: Union[str, AzureAIProject]
     :param rai_client: The RAI client or AI Project client used for fetching parameters.
     :type rai_client: Union[~azure.ai.evaluation.simulator._model_tools.RAIClient, ~azure.ai.evaluation._common.onedp._client.AIProjectClient]
     """
 
-    def __init__(
+    def __init__(
+        self, azure_ai_project: Union[str, AzureAIProject], rai_client: Union[RAIClient, AIProjectClient]
+    ) -> None:
         self.azure_ai_project = azure_ai_project
         self.categorized_ch_parameters: Optional[Dict[str, _CategorizedParameter]] = None
         self.rai_client = rai_client
@@ -164,12 +167,11 @@ class AdversarialTemplateHandler:
         if self.categorized_ch_parameters is None:
             categorized_parameters: Dict[str, _CategorizedParameter] = {}
             util = ContentHarmTemplatesUtils
-
             if isinstance(self.rai_client, RAIClient):
                 parameters = await self.rai_client.get_contentharm_parameters()
             elif isinstance(self.rai_client, AIProjectClient):
                 parameters = literal_eval(self.rai_client.red_teams.get_template_parameters())
-
+
             for k in parameters.keys():
                 template_key = util.get_template_key(k)
                 categorized_parameters[template_key] = {
@@ -181,17 +183,29 @@ class AdversarialTemplateHandler:
 
         template_category = collection_key.split("adv_")[-1]
 
+        # Handle both qa_enterprise and qa_documents mapping to qa
+        if template_category in ["qa_enterprise", "qa_documents"]:
+            template_category = "qa"
+
         plist = self.categorized_ch_parameters
         ch_templates = []
+
         for key, value in plist.items():
+            # Skip enterprise templates for ADVERSARIAL_QA
+            if collection_key == AdversarialScenario.ADVERSARIAL_QA.value and "enterprise" in key:
+                continue
+            # Skip non-enterprise templates for ADVERSARIAL_QA_DOCUMENTS
+            if collection_key == AdversarialScenario.ADVERSARIAL_QA_DOCUMENTS.value and "enterprise" not in key:
+                continue
+
             if value["category"] == template_category:
                 params = value["parameters"]
                 for p in params:
                     p.update({"ch_template_placeholder": "{{ch_template_placeholder}}"})
 
                 template = AdversarialTemplate(template_name=key, text=None, context_key=[], template_parameters=params)
-
                 ch_templates.append(template)
+
         return ch_templates
 
     def get_template(self, template_name: str) -> Optional[AdversarialTemplate]:
```
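The added filtering routes the enterprise ("file_content") QA templates to the new `ADVERSARIAL_QA_DOCUMENTS` scenario and keeps them out of `ADVERSARIAL_QA`. A minimal usage sketch under assumptions: the project endpoint, credential, and target callback below are placeholders, not values taken from this diff.

```python
# Sketch: requesting the new document-grounded adversarial QA scenario.
# Endpoint, credential, and the target callback are placeholders.
import asyncio

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator


async def target(messages, stream=False, session_state=None, context=None):
    # Placeholder: answer with a canned refusal; swap in the application under test.
    messages["messages"].append(
        {"role": "assistant", "content": "I can't help with that.", "context": ""}
    )
    return {
        "messages": messages["messages"],
        "stream": stream,
        "session_state": session_state,
        "context": context,
    }


async def main():
    simulator = AdversarialSimulator(
        azure_ai_project="https://<account>.services.ai.azure.com/api/projects/<project>",  # placeholder
        credential=DefaultAzureCredential(),
    )
    # ADVERSARIAL_QA_DOCUMENTS now receives the "file_content" templates that
    # were previously mixed into ADVERSARIAL_QA.
    outputs = await simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA_DOCUMENTS,
        target=target,
        max_simulation_results=3,
    )
    print(outputs)


asyncio.run(main())
```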
From azure/ai/evaluation/simulator/_model_tools/models.py (+1 -1):

```diff
@@ -478,7 +478,7 @@ class OpenAICompletionsModel(LLMBase):
         time_start = time.time()
         full_response = None
 
-        if
+        if isinstance(session, AIProjectClient):
             response_data = session.red_teams.submit_simulation(request_data, headers, params)
         else:
             response = await session.post(url=self.endpoint_url, headers=headers, json=request_data, params=params)
```
From azure/ai/evaluation/simulator/_simulator.py (+21 -8):

```diff
@@ -7,6 +7,7 @@ import asyncio
 import importlib.resources as pkg_resources
 import json
 import os
+import random
 import re
 import warnings
 from typing import Any, Callable, Dict, List, Optional, Union, Tuple
@@ -19,15 +20,12 @@ from azure.ai.evaluation._common.utils import construct_prompty_model_config
 from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
 
 from .._exceptions import ErrorBlame, ErrorCategory, EvaluationException
-from .._user_agent import USER_AGENT
+from .._user_agent import UserAgentSingleton
 from ._conversation.constants import ConversationRole
 from ._helpers import ConversationHistory, Turn
 from ._utils import JsonLineChatProtocol
 
 
-USER_AGENT += " (type=simulator; subtype=Simulator)"
-
-
 @experimental
 class Simulator:
     """
@@ -53,6 +51,10 @@ class Simulator:
         if "api_version" not in self.model_config:
             self.model_config["api_version"] = "2024-06-01"  # type: ignore
 
+    @staticmethod
+    def __user_agent() -> str:
+        return f"{UserAgentSingleton().value} (type=simulator; subtype=Simulator)"
+
     @staticmethod
     def _validate_model_config(model_config: Any):
         """
@@ -103,6 +105,7 @@ class Simulator:
         user_simulator_prompty_options: Dict[str, Any] = {},
         conversation_turns: List[List[Union[str, Dict[str, Any]]]] = [],
         concurrent_async_tasks: int = 5,
+        randomization_seed: Optional[int] = None,
         **kwargs,
     ) -> List[JsonLineChatProtocol]:
         """
@@ -133,6 +136,9 @@ class Simulator:
         :keyword concurrent_async_tasks: The number of asynchronous tasks to run concurrently during the simulation.
             Defaults to 5.
         :paramtype concurrent_async_tasks: int
+        :keyword randomization_seed: The seed used to randomize task/query order. If unset, the system's
+            default seed is used. Defaults to None.
+        :paramtype randomization_seed: Optional[int]
         :return: A list of simulated conversations represented as JsonLineChatProtocol objects.
         :rtype: List[JsonLineChatProtocol]
 
@@ -158,6 +164,13 @@ class Simulator:
                 f"Only the first {num_queries} lines of the specified tasks will be simulated."
             )
 
+        # Apply randomization to tasks if seed is provided
+        if randomization_seed is not None and tasks:
+            # Create a local random instance to avoid polluting global state
+            local_random = random.Random(randomization_seed)
+            tasks = tasks.copy()  # Don't modify the original list
+            local_random.shuffle(tasks)
+
         max_conversation_turns *= 2  # account for both user and assistant turns
 
         prompty_model_config = self.model_config
```
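The new keyword shuffles the task list with a locally seeded `random.Random`, so repeated runs with the same seed see the same task order without touching global random state. A usage sketch, assuming the documented non-adversarial `Simulator` flow; the model configuration, target callback, and task strings are placeholders, and only `randomization_seed` comes from this change.

```python
# Sketch: reproducible task ordering via the new randomization_seed keyword.
# Model configuration values and the target callback are placeholders.
import asyncio

from azure.ai.evaluation.simulator import Simulator

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
}


async def target(messages, stream=False, session_state=None, context=None):
    # Placeholder: echo a canned answer; swap in the application under test.
    messages["messages"].append(
        {"role": "assistant", "content": "Returns are accepted within 30 days.", "context": ""}
    )
    return {
        "messages": messages["messages"],
        "stream": stream,
        "session_state": session_state,
        "context": context,
    }


async def main():
    simulator = Simulator(model_config=model_config)
    outputs = await simulator(
        target=target,
        text="Contoso allows returns within 30 days of purchase.",
        num_queries=3,
        max_conversation_turns=2,
        tasks=["ask about returns", "ask about refunds", "ask about exchanges"],
        randomization_seed=42,  # tasks are shuffled with a local random.Random(42)
    )
    print(len(outputs))


asyncio.run(main())
```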
Continuing in azure/ai/evaluation/simulator/_simulator.py, the four prompty loaders now build the user agent per call:

```diff
@@ -378,7 +391,7 @@ class Simulator:
             prompty_model_config = construct_prompty_model_config(
                 model_config=prompty_model_config,  # type: ignore
                 default_api_version="2024-06-01",
-                user_agent=USER_AGENT,
+                user_agent=self.__user_agent(),
             )
             return AsyncPrompty.load(source=prompty_path, model=prompty_model_config)  # type: ignore
         except FileNotFoundError as e:
@@ -392,7 +405,7 @@ class Simulator:
             prompty_model_config = construct_prompty_model_config(
                 model_config=prompty_model_config,  # type: ignore
                 default_api_version="2024-06-01",
-                user_agent=USER_AGENT,
+                user_agent=self.__user_agent(),
             )
             return AsyncPrompty.load(
                 source=user_simulator_prompty,
@@ -517,7 +530,7 @@ class Simulator:
             prompty_model_config = construct_prompty_model_config(
                 model_config=prompty_model_config,  # type: ignore
                 default_api_version="2024-06-01",
-                user_agent=USER_AGENT,
+                user_agent=self.__user_agent(),
             )
             return AsyncPrompty.load(source=prompty_path, model=prompty_model_config)  # type: ignore
         except FileNotFoundError as e:
@@ -531,7 +544,7 @@ class Simulator:
             prompty_model_config = construct_prompty_model_config(
                 model_config=prompty_model_config,  # type: ignore
                 default_api_version="2024-06-01",
-                user_agent=USER_AGENT,
+                user_agent=self.__user_agent(),
             )
             return AsyncPrompty.load(
                 source=query_response_generating_prompty,
```
From {azure_ai_evaluation-1.8.0.dist-info → azure_ai_evaluation-1.10.0.dist-info}/METADATA (+46 -3):

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: azure-ai-evaluation
-Version: 1.8.0
+Version: 1.10.0
 Summary: Microsoft Azure Evaluation Library for Python
 Home-page: https://github.com/Azure/azure-sdk-for-python
 Author: Microsoft Corporation
@@ -21,8 +21,6 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: NOTICE.txt
-Requires-Dist: promptflow-devkit>=1.17.1
-Requires-Dist: promptflow-core>=1.17.1
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: azure-identity>=1.16.0
 Requires-Dist: azure-core>=1.30.2
```
The remaining METADATA hunk adds the 1.10.0 and 1.9.0 entries to the Release History section:

```diff
@@ -400,6 +398,51 @@ This project has adopted the [Microsoft Open Source Code of Conduct][code_of_con
 
 # Release History
 
+## 1.10.0 (2025-07-31)
+
+### Breaking Changes
+
+- Added `evaluate_query` parameter to all RAI service evaluators that can be passed as a keyword argument. This parameter controls whether queries are included in evaluation data when evaluating query-response pairs. Previously, queries were always included in evaluations. When set to `True`, both query and response will be evaluated; when set to `False` (default), only the response will be evaluated. This parameter is available across all RAI service evaluators including `ContentSafetyEvaluator`, `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator`, `ProtectedMaterialEvaluator`, `IndirectAttackEvaluator`, `CodeVulnerabilityEvaluator`, `UngroundedAttributesEvaluator`, `GroundednessProEvaluator`, and `EciEvaluator`. Existing code that relies on queries being evaluated will need to explicitly set `evaluate_query=True` to maintain the previous behavior.
```
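For callers that relied on the old behavior, the migration is a single keyword; a minimal sketch, assuming `evaluate_query` is accepted at evaluator construction time (the project endpoint is a placeholder):

```python
# Sketch: opt back in to query + response evaluation after the 1.10.0 default change.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ViolenceEvaluator

violence_eval = ViolenceEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://<account>.services.ai.azure.com/api/projects/<project>",  # placeholder
    evaluate_query=True,  # default is False: only the response is evaluated
)

result = violence_eval(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
print(result)
```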
```diff
+
+### Features Added
+
+- Added support for Azure OpenAI Python grader via `AzureOpenAIPythonGrader` class, which serves as a wrapper around Azure Open AI Python grader configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
+- Added `attack_success_thresholds` parameter to `RedTeam` class for configuring custom thresholds that determine attack success. This allows users to set specific threshold values for each risk category, with scores greater than the threshold considered successful attacks (i.e. higher threshold means higher
+  tolerance for harmful responses).
+- Enhanced threshold reporting in RedTeam results to include default threshold values when custom thresholds aren't specified, providing better transparency about the evaluation criteria used.
+
+
```
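A sketch of wiring the new Python grader into `evaluate()`. Only "supplied to the main `evaluate` method as if it were a normal callable evaluator" is stated above, so the constructor arguments shown here (`name`, `pass_threshold`, `source`) and the shape of the grading function are assumptions; check the shipped signature before relying on them.

```python
# Sketch only: AzureOpenAIPythonGrader's exact constructor may differ from this guess.
from azure.ai.evaluation import AzureOpenAIPythonGrader, evaluate

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
    "api_key": "<api-key>",                                   # placeholder
}

python_grader = AzureOpenAIPythonGrader(
    model_config=model_config,
    name="exact_match",  # assumed parameter
    pass_threshold=1.0,  # assumed parameter
    source=(             # assumed parameter: grading logic supplied as Python source
        "def grade(sample: dict, item: dict) -> float:\n"
        "    return 1.0 if item.get('response') == item.get('ground_truth') else 0.0\n"
    ),
)

# Supplied to evaluate() like any callable evaluator, per the changelog entry above.
results = evaluate(
    data="data.jsonl",  # placeholder dataset with response/ground_truth columns
    evaluators={"exact_match": python_grader},
)
```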
```diff
+### Bugs Fixed
+
+- Fixed red team scan `output_path` issue where individual evaluation results were overwriting each other instead of being preserved as separate files. Individual evaluations now create unique files while the user's `output_path` is reserved for final aggregated results.
+- Significant improvements to TaskAdherence evaluator. New version has less variance, is much faster and consumes fewer tokens.
+- Significant improvements to Relevance evaluator. New version has more concrete rubrics and has less variance, is much faster and consumes fewer tokens.
+
+
+### Other Changes
+
+- The default engine for evaluation was changed from `promptflow` (PFClient) to an in-SDK batch client (RunSubmitterClient)
+- Note: We've temporarily kept an escape hatch to fall back to the legacy `promptflow` implementation by setting `_use_pf_client=True` when invoking `evaluate()`.
+  This is due to be removed in a future release.
+
+
```
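The escape hatch is a keyword argument on `evaluate()`; a minimal sketch (the dataset path and evaluator choice are placeholders):

```python
# Sketch: temporarily fall back to the legacy promptflow batch engine.
from azure.ai.evaluation import F1ScoreEvaluator, evaluate

results = evaluate(
    data="data.jsonl",  # placeholder dataset with response/ground_truth columns
    evaluators={"f1": F1ScoreEvaluator()},
    _use_pf_client=True,  # escape hatch noted above; due to be removed in a future release
)
print(results["metrics"])
```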
```diff
+## 1.9.0 (2025-07-02)
+
+### Features Added
+
+- Added support for Azure Open AI evaluation via `AzureOpenAIScoreModelGrader` class, which serves as a wrapper around Azure Open AI score model configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
+- Added new experimental risk categories ProtectedMaterial and CodeVulnerability for redteam agent scan.
+
+
```
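A hedged sketch of the score-model grader together with the `pass_threshold` called out in the 1.9.0 Bugs Fixed entries below; apart from `model_config`, `pass_threshold`, and passing the grader to `evaluate()`, the argument names (`name`, `model`, `input`, `range`) are assumptions.

```python
# Sketch only: argument names other than model_config and pass_threshold are assumptions.
from azure.ai.evaluation import AzureOpenAIScoreModelGrader

model_config = {
    "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
    "azure_deployment": "<deployment>",                       # placeholder
    "api_key": "<api-key>",                                   # placeholder
}

helpfulness_grader = AzureOpenAIScoreModelGrader(
    model_config=model_config,
    name="helpfulness",   # assumed parameter
    model="gpt-4o-mini",  # assumed parameter: the model that performs the grading
    input=[               # assumed parameter: grading prompt over dataset columns
        {"role": "system", "content": "Rate the helpfulness of the response from 0 to 1."},
        {"role": "user", "content": "Query: {{item.query}}\nResponse: {{item.response}}"},
    ],
    range=[0.0, 1.0],     # assumed parameter
    pass_threshold=0.7,   # minimum score treated as a pass
)

# Used the same way as the Python grader sketched earlier:
# evaluate(data="data.jsonl", evaluators={"helpfulness": helpfulness_grader})
```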
```diff
+### Bugs Fixed
+
+- Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
+
+- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
+- Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
+- Added a new enum `ADVERSARIAL_QA_DOCUMENTS` which moves all the "file_content" type prompts away from `ADVERSARIAL_QA` to the new enum
+- `AzureOpenAIScoreModelGrader` evaluator now supports `pass_threshold` parameter to set the minimum score required for a response to be considered passing. This allows users to define custom thresholds for evaluation results, enhancing flexibility in grading AI model responses.
+
 ## 1.8.0 (2025-05-29)
 
 ### Features Added
```
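The MeteorScore fix above reduces to comparing the raw decimal score instead of its integer truncation; a tiny illustration of the arithmetic behind the bug:

```python
# Why a 0.9375 score used to "fail" a 0.5 threshold: int() truncates before comparison.
score, threshold = 0.9375, 0.5

buggy_pass = int(score) >= threshold  # int(0.9375) == 0, so this is False
fixed_pass = score >= threshold       # 0.9375 >= 0.5, so this is True

print(buggy_pass, fixed_pass)  # False True
```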