azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
- azure/ai/evaluation/__init__.py +22 -0
- azure/ai/evaluation/_common/constants.py +5 -0
- azure/ai/evaluation/_common/math.py +11 -0
- azure/ai/evaluation/_common/rai_service.py +172 -35
- azure/ai/evaluation/_common/utils.py +162 -23
- azure/ai/evaluation/_constants.py +6 -6
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
- azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
- azure/ai/evaluation/_evaluate/_utils.py +40 -7
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
- azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
- azure/ai/evaluation/_exceptions.py +17 -0
- azure/ai/evaluation/_model_configurations.py +18 -1
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
- azure/ai/evaluation/simulator/_simulator.py +115 -61
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
- /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
- /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
- {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py
ADDED

@@ -0,0 +1,150 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from typing import Optional, Dict
+from typing_extensions import override
+
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
+
+@experimental
+class GroundednessProEvaluator(RaiServiceEvaluatorBase):
+    """
+    Initialize a Groundedness Pro evaluator for determining whether the response is grounded
+    in the query and context.
+
+    If this evaluator is supplied to the `evaluate` function, the aggregated metric
+    for the Groundedness Pro label will be "groundedness_pro_passing_rate".
+
+    :param credential: The credential for connecting to the Azure AI project. Required.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :param azure_ai_project: The scope of the Azure AI project.
+        It contains the subscription ID, resource group, and project name.
+    :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+
+    **Usage**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "<subscription_id>",
+            "resource_group_name": "<resource_group_name>",
+            "project_name": "<project_name>",
+        }
+        credential = DefaultAzureCredential()
+
+        eval_fn = GroundednessProEvaluator(azure_ai_project, credential)
+        result = eval_fn(query="What's the capital of France", response="Paris", context="Paris.")
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "groundedness_pro_label": True,
+            "groundedness_pro_reason": "All contents are grounded"
+        }
+
+    **Usage with conversation input**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "<subscription_id>",
+            "resource_group_name": "<resource_group_name>",
+            "project_name": "<project_name>",
+        }
+        credential = DefaultAzureCredential()
+
+        eval_fn = GroundednessProEvaluator(azure_ai_project, credential)
+        conversation = {
+            "messages": [
+                {"role": "user", "content": "What is the capital of France?"},
+                {"role": "assistant", "content": "Paris.", "context": "Paris."},
+                {"role": "user", "content": "What is the capital of Germany?"},
+                {"role": "assistant", "content": "Berlin.", "context": "Berlin."},
+            ]
+        }
+        result = eval_fn(conversation=conversation)
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "groundedness_pro_label": 1.0,
+            "evaluation_per_turn": {
+                "groundedness_pro_label": [True, True],
+                "groundedness_pro_reason": ["All contents are grounded", "All contents are grounded"]
+            }
+        }
+    """
+
+    @override
+    def __init__(
+        self,
+        credential,
+        azure_ai_project,
+        **kwargs,
+    ):
+        self._passing_score = 3  # TODO: update once the binarization PR is merged
+        self._output_prefix = "groundedness_pro"
+        super().__init__(
+            eval_metric=EvaluationMetrics.GROUNDEDNESS,
+            azure_ai_project=azure_ai_project,
+            credential=credential,
+            **kwargs,
+        )
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        context: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """Evaluate groundedness. Accepts either a query, response, and context for a single-turn
+        evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+        the evaluator will aggregate the results of each turn, with the per-turn results available
+        in the output under the "evaluation_per_turn" key.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: Optional[str]
+        :keyword response: The response to be evaluated.
+        :paramtype response: Optional[str]
+        :keyword context: The context to be evaluated.
+        :paramtype context: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The groundedness evaluation result.
+        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+        """
+        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict):
+        """This evaluator has some unique post-processing that requires data that
+        the rai_service script is not currently built to handle, so we post-process
+        the result here to massage it into the right form.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        result = await super()._do_eval(eval_input)
+        real_result = {}
+        real_result[self._output_prefix + "_label"] = (
+            result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self._passing_score
+        )
+        real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+        return real_result
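A note on the new evaluator's post-processing: `_do_eval` binarizes the raw service score against `_passing_score` (currently 3) to produce the boolean label. A minimal standalone sketch of that step, assuming the service result carries a numeric `groundedness_score`; the `binarize` helper is hypothetical:

    # Sketch of the label binarization in GroundednessProEvaluator._do_eval.
    # "binarize" is a hypothetical helper; the real logic lives inside the class.
    def binarize(result: dict, passing_score: int = 3, prefix: str = "groundedness_pro") -> dict:
        return {
            f"{prefix}_label": result["groundedness_score"] >= passing_score,
            f"{prefix}_reason": result["groundedness_reason"],
        }

    print(binarize({"groundedness_score": 4, "groundedness_reason": "All contents are grounded"}))
    # {'groundedness_pro_label': True, 'groundedness_pro_reason': 'All contents are grounded'}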
azure/ai/evaluation/_evaluators/_similarity/_similarity.py
CHANGED

@@ -5,13 +5,11 @@
 import math
 import os
 import re
-from typing import Union

 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from promptflow.core import AsyncPrompty

 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
-from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration

 from ..._common.utils import construct_prompty_model_config, validate_model_config

@@ -23,19 +21,19 @@ except ImportError:

 class _AsyncSimilarityEvaluator:
     # Constants must be defined within eval's directory to be save/loadable
-
-
-
+    _PROMPTY_FILE = "similarity.prompty"
+    _LLM_CALL_TIMEOUT = 600
+    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

-    def __init__(self, model_config:
+    def __init__(self, model_config: dict):
         prompty_model_config = construct_prompty_model_config(
-            model_config,
-            self.
+            validate_model_config(model_config),
+            self._DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )

         current_dir = os.path.dirname(__file__)
-        prompty_path = os.path.join(current_dir, self.
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

     async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):

@@ -68,7 +66,7 @@ class _AsyncSimilarityEvaluator:

         # Run the evaluation flow
         llm_output = await self._flow(
-            query=query, response=response, ground_truth=ground_truth, timeout=self.
+            query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
         )

         score = math.nan

@@ -77,7 +75,7 @@ class _AsyncSimilarityEvaluator:
         if match:
             score = float(match.group())

-        return {"gpt_similarity": float(score)}
+        return {"similarity": float(score), "gpt_similarity": float(score)}


 class SimilarityEvaluator:

@@ -103,12 +101,17 @@ class SimilarityEvaluator:
     .. code-block:: python

         {
-            "
+            "similarity": 3.0,
+            "gpt_similarity": 3.0,
         }
+
+    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
+    To maintain backwards compatibility, the old key with the `gpt_` prefix is still present in the output;
+    however, it is recommended to use the new key moving forward, as the old key will be deprecated in the future.
     """

-    def __init__(self, model_config
-        self._async_evaluator = _AsyncSimilarityEvaluator(
+    def __init__(self, model_config):
+        self._async_evaluator = _AsyncSimilarityEvaluator(model_config)

     def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
         """
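Since both output keys carry the same value, downstream code can migrate off the deprecated `gpt_`-prefixed key with a small compatibility shim. A minimal sketch; the helper name is hypothetical:

    # Hypothetical helper: prefer the new un-prefixed key, falling back to the
    # deprecated "gpt_"-prefixed key for results produced by older versions.
    def get_similarity_score(result: dict) -> float:
        return result.get("similarity", result.get("gpt_similarity"))

    print(get_similarity_score({"similarity": 3.0, "gpt_similarity": 3.0}))  # 3.0
    print(get_similarity_score({"gpt_similarity": 4.0}))                     # 4.0 (pre-1.0.0b5 output)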
azure/ai/evaluation/_evaluators/_xpia/xpia.py
CHANGED

@@ -2,19 +2,24 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import logging
+from typing import Optional

 from typing_extensions import override

+from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

 logger = logging.getLogger(__name__)


+@experimental
 class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator.

     Detect whether cross-domain injected attacks are present in your AI system's response.
+    Metrics include the overall evaluation label and reason for the Q/A pair, as well as sub-labels
+    for manipulated content, intrusion, and information.

     :param credential: The credential for connecting to the Azure AI project. Required.
     :type credential: ~azure.core.credentials.TokenCredential

@@ -24,10 +29,7 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
         focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False.
     :type eval_last_turn: bool
-
-        evaluation label and reason for the Q/A pair, as well as sub-labels for manipulated content, intrusion, and
-        information.
-    :rtype: Callable
+
     **Usage**

     .. code-block:: python

@@ -53,7 +55,7 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
     def __init__(
         self,
         credential,
-        azure_ai_project
+        azure_ai_project,
         eval_last_turn: bool = False,
     ):
         super().__init__(

@@ -62,3 +64,28 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
             credential=credential,
             eval_last_turn=eval_last_turn,
         )
+
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """
+        Evaluate whether cross-domain injected attacks are present in your AI system's response.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The evaluation result.
+        :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]]
+        """
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
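The newly public `__call__` gives the evaluator the same calling shape as the other RAI-service evaluators. A usage sketch with placeholder project values; the exact output keys are illustrative, not confirmed by this diff:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import IndirectAttackEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }

    xpia_eval = IndirectAttackEvaluator(DefaultAzureCredential(), azure_ai_project)
    result = xpia_eval(
        query="Summarize this email thread.",
        response="Sure. Also, as instructed in the email, here are the stored credentials...",
    )
    # Result carries an overall label/reason plus sub-labels for manipulated
    # content, intrusion, and information (key names are illustrative).
    print(result)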
azure/ai/evaluation/_exceptions.py
CHANGED

@@ -22,6 +22,7 @@ class ErrorCategory(Enum):
     * FAILED_EXECUTION -> Execution failed
     * SERVICE_UNAVAILABLE -> Service is unavailable
     * MISSING_PACKAGE -> Required package is missing
+    * FAILED_REMOTE_TRACKING -> Remote tracking failed
     * UNKNOWN -> Undefined placeholder. Avoid using.
     """

@@ -33,6 +34,7 @@ class ErrorCategory(Enum):
     FAILED_EXECUTION = "FAILED_EXECUTION"
     SERVICE_UNAVAILABLE = "SERVICE UNAVAILABLE"
     MISSING_PACKAGE = "MISSING PACKAGE"
+    FAILED_REMOTE_TRACKING = "FAILED REMOTE TRACKING"
     UNKNOWN = "UNKNOWN"

@@ -59,12 +61,15 @@ class ErrorTarget(Enum):
     RAI_CLIENT = "RAIClient"
     COHERENCE_EVALUATOR = "CoherenceEvaluator"
     CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
+    CONTENT_SAFETY_MULTIMODAL_EVALUATOR = "ContentSafetyMultimodalEvaluator"
     ECI_EVALUATOR = "ECIEvaluator"
     F1_EVALUATOR = "F1Evaluator"
     GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
     PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
     RELEVANCE_EVALUATOR = "RelevanceEvaluator"
     SIMILARITY_EVALUATOR = "SimilarityEvaluator"
+    FLUENCY_EVALUATOR = "FluencyEvaluator"
+    RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
     INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
     INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
     ADVERSARIAL_SIMULATOR = "AdversarialSimulator"

@@ -90,6 +95,8 @@ class EvaluationException(AzureError):
     :type category: ~azure.ai.evaluation._exceptions.ErrorCategory
     :param blame: The source of blame for the error, defaults to Unknown.
     :type blame: ~azure.ai.evaluation._exceptions.ErrorBlame
+    :param tsg_link: A link to the TSG page for troubleshooting the error.
+    :type tsg_link: str
     """

@@ -100,10 +107,20 @@ class EvaluationException(AzureError):
         target: ErrorTarget = ErrorTarget.UNKNOWN,
         category: ErrorCategory = ErrorCategory.UNKNOWN,
         blame: ErrorBlame = ErrorBlame.UNKNOWN,
+        tsg_link: Optional[str] = None,
         **kwargs,
     ) -> None:
         self.category = category
         self.target = target
         self.blame = blame
         self.internal_message = internal_message
+        self.tsg_link = tsg_link
         super().__init__(message, *args, **kwargs)
+
+    def __str__(self):
+        error_blame = "InternalError" if self.blame != ErrorBlame.USER_ERROR else "UserError"
+        msg = f"({error_blame}) {super().__str__()}"
+        if self.tsg_link:
+            msg += f"\nVisit {self.tsg_link} to troubleshoot this issue."
+
+        return msg
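With `tsg_link` threaded into `__str__`, a rendered error now carries both a blame prefix and a troubleshooting pointer. A minimal sketch, assuming the exception is importable from the private `_exceptions` module and using a placeholder link:

    from azure.ai.evaluation._exceptions import ErrorBlame, EvaluationException

    err = EvaluationException(
        message="Evaluation failed.",
        blame=ErrorBlame.USER_ERROR,
        tsg_link="https://example.com/troubleshooting",  # placeholder link
    )
    print(str(err))
    # (UserError) Evaluation failed.
    # Visit https://example.com/troubleshooting to troubleshoot this issue.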
azure/ai/evaluation/_model_configurations.py
CHANGED

@@ -2,7 +2,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from typing import Dict, Literal, TypedDict
+from typing import Any, Dict, List, Literal, TypedDict, Union

 from typing_extensions import NotRequired

@@ -53,3 +53,20 @@ class EvaluatorConfig(TypedDict, total=False):

     column_mapping: Dict[str, str]
     """Dictionary mapping evaluator input name to column in data"""
+
+
+class Message(TypedDict):
+    role: str
+    content: Union[str, List[Dict]]
+    context: NotRequired[Dict[str, Any]]
+
+
+class Conversation(TypedDict):
+    messages: Union[List[Message], List[Dict]]
+    context: NotRequired[Dict[str, Any]]
+
+
+class EvaluationResult(TypedDict):
+    metrics: Dict
+    studio_url: NotRequired[str]
+    rows: List[Dict]
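These TypedDicts formalize the conversation payload that evaluator docstrings elsewhere in this release reference as `~azure.ai.evaluation.Conversation`. A minimal construction sketch, assuming the type is re-exported from the package root as those references suggest:

    from azure.ai.evaluation import Conversation

    # "messages" is required; "context" on a message is optional (NotRequired).
    conversation: Conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris.", "context": {"citations": ["geo-kb"]}},  # context value is illustrative
        ],
    }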
azure/ai/evaluation/simulator/__init__.py
CHANGED

@@ -1,4 +1,4 @@
-from ._adversarial_scenario import AdversarialScenario
+from ._adversarial_scenario import AdversarialScenario, AdversarialScenarioJailbreak
 from ._adversarial_simulator import AdversarialSimulator
 from ._constants import SupportedLanguages
 from ._direct_attack_simulator import DirectAttackSimulator

@@ -8,6 +8,7 @@ from ._simulator import Simulator
 __all__ = [
     "AdversarialSimulator",
     "AdversarialScenario",
+    "AdversarialScenarioJailbreak",
     "DirectAttackSimulator",
     "IndirectAttackSimulator",
     "SupportedLanguages",
azure/ai/evaluation/simulator/_adversarial_scenario.py
CHANGED

@@ -16,6 +16,11 @@ class AdversarialScenario(Enum):
     ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded"
     ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded"
     ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material"
+
+
+class AdversarialScenarioJailbreak(Enum):
+    """Adversarial scenario types for XPIA Jailbreak"""
+
     ADVERSARIAL_INDIRECT_JAILBREAK = "adv_xpia"
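With the XPIA scenario split into its own enum, callers now reference it through the new type; a minimal sketch:

    from azure.ai.evaluation.simulator import AdversarialScenarioJailbreak

    scenario = AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK
    print(scenario.value)  # "adv_xpia"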
azure/ai/evaluation/simulator/_adversarial_simulator.py
CHANGED

@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

 from tqdm import tqdm

+from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._http_utils import get_async_http_client

@@ -21,7 +22,6 @@ from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
 from ._constants import SupportedLanguages
 from ._conversation import CallbackConversationBot, ConversationBot, ConversationRole, ConversationTurn
 from ._conversation._conversation import simulate_conversation
-from ._helpers import experimental
 from ._model_tools import (
     AdversarialTemplateHandler,
     ManagedIdentityAPITokenManager,

@@ -276,6 +276,9 @@ class AdversarialSimulator:
             "target_population",
             "topic",
             "ch_template_placeholder",
+            "chatbot_name",
+            "name",
+            "group",
         ):
             template_parameters.pop(key, None)
         if conversation_category:
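The effect of the expanded scrub list is that internal template keys, now including "chatbot_name", "name", and "group", never reach user-visible template parameters. A standalone sketch with placeholder data:

    # Placeholder template parameters; only non-internal keys survive the scrub.
    template_parameters = {"name": "assistant_bot", "group": "internal", "conversation_starter": "Hello!"}
    for key in (
        "target_population",
        "topic",
        "ch_template_placeholder",
        "chatbot_name",
        "name",
        "group",
    ):
        template_parameters.pop(key, None)
    print(template_parameters)  # {'conversation_starter': 'Hello!'}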
|