azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)
  1. azure/ai/evaluation/_azure/__init__.py +3 -0
  2. azure/ai/evaluation/_azure/_clients.py +188 -0
  3. azure/ai/evaluation/_azure/_models.py +227 -0
  4. azure/ai/evaluation/_azure/_token_manager.py +118 -0
  5. azure/ai/evaluation/_common/_experimental.py +4 -0
  6. azure/ai/evaluation/_common/math.py +62 -2
  7. azure/ai/evaluation/_common/rai_service.py +110 -50
  8. azure/ai/evaluation/_common/utils.py +50 -16
  9. azure/ai/evaluation/_constants.py +2 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
  12. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
  13. azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
  14. azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
  15. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
  16. azure/ai/evaluation/_evaluate/_utils.py +72 -38
  17. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
  18. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
  19. azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
  20. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
  21. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
  28. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
  29. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
  33. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
  34. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
  35. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
  36. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
  37. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
  38. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
  39. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
  40. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
  41. azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
  42. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
  43. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
  44. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
  45. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
  46. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
  48. azure/ai/evaluation/_exceptions.py +2 -0
  49. azure/ai/evaluation/_http_utils.py +6 -4
  50. azure/ai/evaluation/_model_configurations.py +65 -14
  51. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  52. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  53. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  54. azure/ai/evaluation/_version.py +1 -1
  55. azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
  56. azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
  57. azure/ai/evaluation/simulator/_constants.py +11 -1
  58. azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
  59. azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
  60. azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
  61. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
  62. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
  63. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
  64. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
  65. azure/ai/evaluation/simulator/_simulator.py +54 -45
  66. azure/ai/evaluation/simulator/_utils.py +25 -7
  67. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
  68. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
  69. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
  70. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
  71. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
  72. {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -1,11 +1,12 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing_extensions import override
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
 
 @experimental
@@ -49,16 +50,40 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
         }
     """
 
+    id = "eci"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(
         self,
         credential,
         azure_ai_project,
-        eval_last_turn: bool = False,
     ):
         super().__init__(
             eval_metric=_InternalEvaluationMetrics.ECI,
             azure_ai_project=azure_ai_project,
             credential=credential,
-            eval_last_turn=eval_last_turn,
         )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ): ...
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ): ...
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        return super().__call__(*args, **kwargs)
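
The hunks above drop the `eval_last_turn` flag and give ECIEvaluator typed `__call__` overloads, so it now accepts either a query/response pair or a whole conversation. A minimal sketch of the two call shapes under the new signature; credential, project values, and strings are placeholders, and the import path is the private module from the file list above (this experimental evaluator may not be exported from the package root).

    # Sketch only: placeholder project values; ECIEvaluator is experimental.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }
    eci_eval = ECIEvaluator(credential=DefaultAzureCredential(), azure_ai_project=azure_ai_project)

    # Overload 1: a single query/response pair.
    single_turn = eci_eval(query="What is the capital of France?", response="Paris.")

    # Overload 2: a full conversation; per-turn results are aggregated by the base class.
    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris."},
        ]
    }
    aggregated = eci_eval(conversation=conversation)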

azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -106,27 +106,34 @@ class _AsyncF1ScoreEvaluator:
 
 class F1ScoreEvaluator:
     """
-    Initialize a f1 score evaluator for calculating F1 score.
+    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
 
-    **Usage**
+    F1 Scores range from 0 to 1, with 1 being the best possible score.
 
-    .. code-block:: python
+    The F1-score computes the ratio of the number of shared words between the model generation and
+    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
+    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
+    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
+    is the ratio of the number of shared words to the total number of words in the ground truth.
 
-        eval_fn = F1ScoreEvaluator()
-        result = eval_fn(
-            response="The capital of Japan is Tokyo.",
-            ground_truth="Tokyo is Japan's capital, known for its blend of traditional culture \
-                and technological advancements.")
+    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
+    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
+    information in the response.
 
-    **Output format**
 
-    .. code-block:: python
+    .. admonition:: Example:
 
-        {
-            "f1_score": 0.42
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an F1ScoreEvaluator.
     """
 
+    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self):
         self._async_evaluator = _AsyncF1ScoreEvaluator()
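
The rewritten docstring spells out the metric: precision is shared words over words in the generation, recall is shared words over words in the ground truth, and F1 is their harmonic mean. A rough sketch of that arithmetic, assuming simple whitespace tokenization (the SDK's own tokenizer and normalization may differ), followed by the evaluator call kept from the removed usage example:

    # Rough sketch of the word-overlap F1 described above; not the SDK's exact tokenization.
    from collections import Counter

    from azure.ai.evaluation import F1ScoreEvaluator


    def f1_sketch(response: str, ground_truth: str) -> float:
        response_tokens = response.lower().split()
        truth_tokens = ground_truth.lower().split()
        shared = sum((Counter(response_tokens) & Counter(truth_tokens)).values())
        if shared == 0:
            return 0.0
        precision = shared / len(response_tokens)  # shared words / words in the generation
        recall = shared / len(truth_tokens)        # shared words / words in the ground truth
        return 2 * precision * recall / (precision + recall)


    # The packaged evaluator, as in the removed docstring example:
    f1_eval = F1ScoreEvaluator()
    result = f1_eval(
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements.",
    )
    print(result)  # e.g. {"f1_score": 0.42}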
 

azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -3,57 +3,89 @@
 # ---------------------------------------------------------
 
 import os
-from typing import Optional
+from typing import Dict, List, Union
 
-from typing_extensions import override
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
 
-class FluencyEvaluator(PromptyEvaluatorBase):
+class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
-    Initialize a fluency evaluator configured for a specific Azure OpenAI model.
+    Evaluates the fluency of a given response or a multi-turn conversation, including reasoning.
+
+    The fluency measure assesses the extent to which the generated text conforms to grammatical rules, syntactic
+    structures, and appropriate vocabulary usage, resulting in linguistically correct responses.
+
+    Fluency scores range from 1 to 5, with 1 being the least fluent and 5 being the most fluent.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-    **Usage**
-
-    .. code-block:: python
+    .. admonition:: Example:
 
-        eval_fn = FluencyEvaluator(model_config)
-        result = eval_fn(response="The capital of Japan is Tokyo.")
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START fluency_evaluator]
+            :end-before: [END fluency_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a FluencyEvaluator.
 
-    **Output format**
+    .. note::
 
-    .. code-block:: python
-
-        {
-            "fluency": 4.0,
-            "gpt_fluency": 4.0,
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """
 
     _PROMPTY_FILE = "fluency.prompty"
     _RESULT_KEY = "fluency"
 
+    id = "azureml://registries/azureml/models/Fluency-Evaluator/versions/4"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
 
-    @override
+    @overload
     def __call__(
         self,
         *,
-        response: Optional[str] = None,
-        conversation=None,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate fluency in given response
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The fluency score
+        :rtype: Dict[str, float]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate fluency for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The fluency score
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """
@@ -62,12 +94,11 @@ class FluencyEvaluator(PromptyEvaluatorBase):
         the evaluator will aggregate the results of each turn.
 
         :keyword response: The response to be evaluated. Mutually exclusive with the "conversation" parameter.
-        :paramtype response: str
+        :paramtype response: Optional[str]
         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
             key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
        :return: The fluency score.
        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
        """
-
-        return super().__call__(response=response, conversation=conversation, **kwargs)
+        return super().__call__(*args, **kwargs)
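
With the overloads above, FluencyEvaluator is called either with a single `response` or with a `conversation`, and the catch-all `__call__` simply forwards to the base class. A hedged usage sketch; the model configuration values are placeholders and the output keys follow the note about the `gpt_`-prefixed alias.

    # Sketch only: endpoint, deployment, and key are placeholders.
    from azure.ai.evaluation import FluencyEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "azure_deployment": "<deployment_name>",
        "api_key": "<api_key>",
    }
    fluency = FluencyEvaluator(model_config)

    # Overload 1: score a single response, e.g. {"fluency": 4.0, "gpt_fluency": 4.0, ...}.
    single = fluency(response="The capital of Japan is Tokyo.")

    # Overload 2: score a multi-turn conversation; the base class aggregates per-turn results.
    conversation = {
        "messages": [
            {"role": "user", "content": "Tell me about Tokyo."},
            {"role": "assistant", "content": "Tokyo is the capital of Japan."},
        ]
    }
    aggregated = fluency(conversation=conversation)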

azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -24,31 +24,29 @@ class _AsyncGleuScoreEvaluator:
 
 class GleuScoreEvaluator:
     """
-    Evaluator that computes the BLEU Score between two strings.
+    Calculates the GLEU (Google-BLEU) score between a response and the ground truth.
 
     The GLEU (Google-BLEU) score evaluator measures the similarity between generated and reference texts by
     evaluating n-gram overlap, considering both precision and recall. This balanced evaluation, designed for
     sentence-level assessment, makes it ideal for detailed analysis of translation quality. GLEU is well-suited for
     use cases such as machine translation, text summarization, and text generation.
 
-    **Usage**
+    GLEU scores range from 0 to 1, where a value of 1 represents perfect overlap between the response and
+    the ground truth and a value of 0 indicates no overlap.
 
-    .. code-block:: python
+    .. admonition:: Example:
 
-        eval_fn = GleuScoreEvaluator()
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "gleu_score": 0.41
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START gleu_score_evaluator]
+            :end-before: [END gleu_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a GleuScoreEvaluator.
     """
 
+    id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self):
         self._async_evaluator = _AsyncGleuScoreEvaluator()
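
The GLEU change is documentation plus the new cloud `id`; the call signature is unchanged, so the removed usage example still applies:

    from azure.ai.evaluation import GleuScoreEvaluator

    gleu = GleuScoreEvaluator()
    result = gleu(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result)  # e.g. {"gleu_score": 0.41}, per the removed output-format example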
 

azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -2,12 +2,13 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
-from typing import Optional
+from typing import Dict, List, Optional, Union
 
-from typing_extensions import override
+from typing_extensions import overload, override
 from promptflow.core import AsyncPrompty
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 from ..._common.utils import construct_prompty_model_config, validate_model_config
 
 try:
@@ -16,36 +17,37 @@ except ImportError:
     USER_AGENT = "None"
 
 
-class GroundednessEvaluator(PromptyEvaluatorBase):
+class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
-    Initialize a groundedness evaluator configured for a specific Azure OpenAI model.
+    Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    including reasoning.
+
+    The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
+    context, making sure that these claims are substantiated by the context. Even if the responses from LLM are
+    factually correct, they'll be considered ungrounded if they can't be verified against the provided sources
+    (such as your input source or your database). Use the groundedness metric when you need to verify that
+    AI-generated responses align with and are validated by the provided context.
+
+    Groundedness scores range from 1 to 5, with 1 being the least grounded and 5 being the most grounded.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-    **Usage**
-
-    .. code-block:: python
+    .. admonition:: Example:
 
-        eval_fn = GroundednessEvaluator(model_config)
-        result = eval_fn(
-            response="The capital of Japan is Tokyo.",
-            context="Tokyo is Japan's capital, known for its blend of traditional culture \
-                and technological advancements.")
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START groundedness_evaluator]
+            :end-before: [END groundedness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a GroundednessEvaluator.
 
-    **Output format**
+    .. note::
 
-    .. code-block:: python
-
-        {
-            "groundedness": 5,
-            "gpt_groundedness": 5,
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """
 
     _PROMPTY_FILE_NO_QUERY = "groundedness_without_query.prompty"
@@ -53,6 +55,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase):
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]
 
+    id = "azureml://registries/azureml/models/Groundedness-Evaluator/versions/4"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
@@ -62,14 +67,47 @@ class GroundednessEvaluator(PromptyEvaluatorBase):
         self._model_config = model_config
         # Needs to be set because it's used in call method to re-validate prompt if `query` is provided
 
-    @override
+    @overload
     def __call__(
         self,
         *,
+        response: str,
+        context: str,
         query: Optional[str] = None,
-        response: Optional[str] = None,
-        context: Optional[str] = None,
-        conversation=None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for given input of response, context
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword context: The context to be evaluated.
+        :paramtype context: str
+        :keyword query: The query to be evaluated. Optional parameter for use with the `response`
+            and `context` parameters. If provided, a different prompt template will be used for evaluation.
+        :paramtype query: Optional[str]
+        :return: The groundedness score.
+        :rtype: Dict[str, float]
+        """
+
+    @overload
+    def __call__(
+        self,
+        *,
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate groundedness for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate groundedness. Accepts either a query, response, and context for a single evaluation,
@@ -89,10 +127,10 @@
             to be dictionaries with keys "content", "role", and possibly "context".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The relevance score.
-        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
 
-        if query:
+        if kwargs.get("query", None):
             current_dir = os.path.dirname(__file__)
             prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
             self._prompty_file = prompty_path
@@ -103,4 +141,4 @@
             )
         self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
 
-        return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
+        return super().__call__(*args, **kwargs)
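
The groundedness overloads mirror the fluency ones, and the catch-all `__call__` now reads `query` from kwargs to decide whether to reload the flow with the "with query" prompty. A hedged sketch of both call shapes; configuration values are placeholders.

    # Sketch only: model configuration values are placeholders.
    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "azure_deployment": "<deployment_name>",
        "api_key": "<api_key>",
    }
    groundedness = GroundednessEvaluator(model_config)

    context = "Tokyo is Japan's capital, known for its blend of traditional culture and technology."

    # Response + context: evaluated with the "groundedness_without_query" prompty.
    without_query = groundedness(
        response="The capital of Japan is Tokyo.",
        context=context,
    )

    # Passing a query triggers the branch above and swaps in the "with query" prompty.
    with_query = groundedness(
        query="What is the capital of Japan?",
        response="The capital of Japan is Tokyo.",
        context=context,
    )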

azure/ai/evaluation/_evaluators/_meteor/_meteor.py

@@ -34,7 +34,7 @@ class _AsyncMeteorScoreEvaluator:
 
 class MeteorScoreEvaluator:
     """
-    Evaluator that computes the METEOR Score between two strings.
+    Calculates the METEOR score for a given response and ground truth.
 
     The METEOR (Metric for Evaluation of Translation with Explicit Ordering) score grader evaluates generated text by
     comparing it to reference texts, focusing on precision, recall, and content alignment. It addresses limitations of
@@ -42,6 +42,12 @@ class MeteorScoreEvaluator:
     word stems to more accurately capture meaning and language variations. In addition to machine translation and
     text summarization, paraphrase detection is an optimal use case for the METEOR score.
 
+    Use the METEOR score when you want a more linguistically informed evaluation metric that captures not only
+    n-gram overlap but also accounts for synonyms, stemming, and word order. This is particularly useful for evaluating
+    tasks like machine translation, text summarization, and text generation.
+
+    The METEOR score ranges from 0 to 1, with 1 indicating a perfect match.
+
     :param alpha: The METEOR score alpha parameter. Default is 0.9.
     :type alpha: float
     :param beta: The METEOR score beta parameter. Default is 3.0.
@@ -49,28 +55,19 @@
     :param gamma: The METEOR score gamma parameter. Default is 0.5.
     :type gamma: float
 
-    **Usage**
-
-    .. code-block:: python
-
-        eval_fn = MeteorScoreEvaluator(
-            alpha=0.9,
-            beta=3.0,
-            gamma=0.5
-        )
-        result = eval_fn(
-            response="Tokyo is the capital of Japan.",
-            ground_truth="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "meteor_score": 0.62
-        }
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
     """
 
+    id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
         self._async_evaluator = _AsyncMeteorScoreEvaluator(alpha=alpha, beta=beta, gamma=gamma)
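
METEOR likewise keeps its call signature; only the docstring and the cloud `id` change, so the removed usage example, with the documented default parameters, still holds:

    from azure.ai.evaluation import MeteorScoreEvaluator

    # Defaults shown explicitly, matching the documented alpha/beta/gamma values.
    meteor = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5)
    result = meteor(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result)  # e.g. {"meteor_score": 0.62}, per the removed output-format example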
 

azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py

@@ -28,12 +28,10 @@ class ContentSafetyMultimodalEvaluator:
     :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID,
         resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param parallel: Specifies whether to use parallel execution for evaluators.
-        If True, evaluators execute in parallel; otherwise, they execute sequentially. Defaults to True.
-    :type parallel: bool
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
 
     :return: A function that evaluates multimodal chat messages and generates content safety metrics.
-    :rtype: Callable
 
     **Usage Example**
 
@@ -45,7 +43,7 @@
            "project_name": "<project_name>",
        }
        eval_fn = ContentSafetyMultimodalEvaluator(azure_ai_project)
-        result = eval_fn(
+        result = eval_fn(conversation=
            {
                "messages": [
                    {
@@ -92,8 +90,11 @@
 
     """
 
-    def __init__(self, credential, azure_ai_project, parallel: bool = False):
-        self._parallel = parallel
+    id = "content_safety_multimodal"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(self, credential, azure_ai_project, **kwargs):
+        self._parallel = kwargs.pop("_parallel", False)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project),
             SexualMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project),
@@ -104,8 +105,9 @@
     def __call__(self, *, conversation, **kwargs):
         """
         Evaluates content-safety metrics for list of messages.
+
         :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys.
+            Each message should have "role" and "content" keys. It supports single turn only.
         :paramtype conversation: ~azure.ai.evaluation.Conversation
         :return: The evaluation score based on the Content Safety Metrics.
         :rtype: Dict[str, Union[float, str]]
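
The constructor now hides parallelism behind `**kwargs` (`_parallel` defaults to False) and the docstring example passes `conversation=` explicitly. A hedged sketch of a single-turn multimodal call; the top-level import, project values, and image URL are placeholders or assumptions, and the message/content shape is inferred from the truncated usage example above rather than confirmed by it.

    # Sketch only: placeholder project values and image URL; message shape is an assumption.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyMultimodalEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }
    safety_eval = ContentSafetyMultimodalEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )

    result = safety_eval(
        conversation={
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "What is shown in this image?"},
                        {"type": "image_url", "image_url": {"url": "<image_url_or_data_uri>"}},
                    ],
                },
                {"role": "assistant", "content": "The image shows a city skyline."},
            ]
        }
    )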

azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py

@@ -7,10 +7,8 @@ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_mu
 from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
 from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential
-from azure.ai.evaluation._common._experimental import experimental
 
 
-@experimental
 class ContentSafetyMultimodalEvaluatorBase(ABC):
     """
     Initialize a evaluator for a specified Evaluation Metric. Base class that is not

azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py

@@ -28,7 +28,7 @@ class HateUnfairnessMultimodalEvaluator:
            "project_name": "<project_name>",
        }
        eval_fn = HateUnfairnessMultimodalEvaluator(azure_ai_project)
-        result = eval_fn(
+        result = eval_fn(conversation=
            {
                "messages": [
                    {
@@ -66,14 +66,18 @@
 
 
     """
+    id = "hate_unfairness_multimodal"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(self, credential, azure_ai_project):
         self._async_evaluator = _AsyncHateUnfairnessMultimodalEvaluator(credential, azure_ai_project)
 
     def __call__(self, *, conversation, **kwargs):
         """
         Evaluates hate unfairness content.
+
         :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys.
+            Each message should have "role" and "content" keys. It supports single turn only.
         :paramtype conversation: ~azure.ai.evaluation.Conversation
         :return: The hate unfairness score.
         :rtype: Dict[str, Union[float, str]]

azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py

@@ -2,10 +2,10 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._common.utils import validate_conversation
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_multimodal
+from azure.ai.evaluation._common._experimental import experimental
 
 
 @experimental
@@ -22,7 +22,6 @@ class ProtectedMaterialMultimodalEvaluator:
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
 
     :return: A dictionary containing the evaluation result label and reasoning.
-    :rtype: Dict[str, str]
 
     **Usage Example**
 
@@ -34,7 +33,7 @@
            "project_name": "<project_name>",
        }
        eval_fn = ProtectedMaterialMultimodalEvaluator(azure_ai_project)
-        result = eval_fn(
+        result = eval_fn(conversation=
            {
                "messages": [
                    {
@@ -71,6 +70,9 @@
 
     """
 
+    id = "protected_material_multimodal"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     def __init__(
         self,
         credential,
@@ -82,8 +84,9 @@
         """
         Evaluates protected materials content.
 
-        :keyword messages: The messages to be evaluated. Each message should have "role" and "content" keys.
-        :paramtype messages: ~azure.ai.evaluation.Conversation
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys. It supports single turn only.
+        :paramtype conversation: ~azure.ai.evaluation.Conversation
         :return: A dictionary containing a boolean label and reasoning.
         :rtype: Dict[str, str]
         """
@@ -101,8 +104,9 @@ class _AsyncProtectedMaterialMultimodalEvaluator:
     async def __call__(self, *, conversation, **kwargs):
         """
         Evaluates content according to this evaluator's metric.
+
         :keyword conversation: The conversation contains list of messages to be evaluated.
-            Each message should have "role" and "content" keys.
+            Each message should have "role" and "content" keys. It supports single turn only.
         :paramtype conversation: ~azure.ai.evaluation.Conversation
         :return: The evaluation score computation based on the Content Safety metric (self.metric).
         :rtype: Any
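
The docstring fix above matters for callers: the keyword is `conversation` (single turn only), not `messages` as the old text claimed. A short sketch with the corrected keyword; the top-level import, placeholders, and message shape are assumptions, as in the previous multimodal example.

    # Sketch only: placeholders throughout; the point is the `conversation=` keyword.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialMultimodalEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }
    pm_eval = ProtectedMaterialMultimodalEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project=azure_ai_project,
    )

    result = pm_eval(
        conversation={
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Transcribe the text in this image."},
                        {"type": "image_url", "image_url": {"url": "<image_url_or_data_uri>"}},
                    ],
                },
                {"role": "assistant", "content": "I can't reproduce that material."},
            ]
        }
    )
    # `result` carries a boolean label and a reason string, per the docstring above.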