azure-ai-evaluation 1.0.0b5__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/_azure/__init__.py +3 -0
- azure/ai/evaluation/_azure/_clients.py +188 -0
- azure/ai/evaluation/_azure/_models.py +227 -0
- azure/ai/evaluation/_azure/_token_manager.py +118 -0
- azure/ai/evaluation/_common/_experimental.py +4 -0
- azure/ai/evaluation/_common/math.py +62 -2
- azure/ai/evaluation/_common/rai_service.py +110 -50
- azure/ai/evaluation/_common/utils.py +50 -16
- azure/ai/evaluation/_constants.py +2 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +13 -3
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +12 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +38 -43
- azure/ai/evaluation/_evaluate/_evaluate.py +62 -131
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_utils.py +72 -38
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +16 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +60 -29
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +88 -6
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +16 -3
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +39 -10
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +58 -52
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +79 -34
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +73 -34
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +74 -33
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -34
- azure/ai/evaluation/_evaluators/_eci/_eci.py +28 -3
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -13
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +57 -26
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +13 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +68 -30
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +17 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +10 -8
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -2
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +10 -6
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +6 -2
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +6 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +57 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +25 -37
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +63 -29
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +76 -161
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +24 -25
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +65 -67
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +26 -20
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +74 -40
- azure/ai/evaluation/_exceptions.py +2 -0
- azure/ai/evaluation/_http_utils.py +6 -4
- azure/ai/evaluation/_model_configurations.py +65 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +17 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +57 -47
- azure/ai/evaluation/simulator/_constants.py +11 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +128 -7
- azure/ai/evaluation/simulator/_conversation/_conversation.py +0 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +16 -8
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +12 -1
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +3 -1
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +48 -4
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -0
- azure/ai/evaluation/simulator/_simulator.py +54 -45
- azure/ai/evaluation/simulator/_utils.py +25 -7
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/METADATA +240 -327
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/RECORD +71 -68
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +0 -322
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b5.dist-info → azure_ai_evaluation-1.1.0.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
+++ b/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -2,70 +2,101 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import os
-from typing import
+from typing import Dict, Union, List
 
-from typing_extensions import override
+from typing_extensions import overload, override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
 
 
-class CoherenceEvaluator(PromptyEvaluatorBase):
+class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
-
+    Evaluates coherence score for a given query and response or a multi-turn conversation, including reasoning.
+
+    The coherence measure assesses the ability of the language model to generate text that reads naturally,
+    flows smoothly, and resembles human-like language in its responses. Use it when assessing the readability
+    and user-friendliness of a model's generated responses in real-world applications.
 
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
 
-
-
-    .. code-block:: python
-
-        eval_fn = CoherenceEvaluator(model_config)
-        result = eval_fn(
-            query="What is the capital of Japan?",
-            response="The capital of Japan is Tokyo.")
+    .. admonition:: Example:
 
-
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START coherence_evaluator]
+            :end-before: [END coherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a CoherenceEvaluator with a query and response.
 
-    ..
+    .. note::
 
-
-
-
-        }
-
-    Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
-    To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
-    however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
+        To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
+        To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
+        however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
     """
 
     _PROMPTY_FILE = "coherence.prompty"
     _RESULT_KEY = "coherence"
 
+    id = "azureml://registries/azureml/models/Coherence-Evaluator/versions/4"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     @override
     def __init__(self, model_config):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
 
-    @
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate coherence for given input of query, response
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The coherence score.
+        :rtype: Dict[str, float]
+        """
+
+    @overload
     def __call__(
         self,
         *,
-
-
-        conversation
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate coherence for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The coherence score.
+        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate coherence. Accepts either a query and response for a single evaluation,
         or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
         turns, the evaluator will aggregate the results of each turn.
 
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: Optional[str]
-        :keyword context: The context to be evaluated.
-        :paramtype context: Optional[str]
         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
            key "messages". Conversation turns are expected
            to be dictionaries with keys "content" and "role".
@@ -73,4 +104,4 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
         :return: The relevance score.
        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)
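Because the inline usage example was dropped from the docstring in favor of a literalinclude, here is a hedged sketch of how the two call shapes above are exercised. The endpoint, key, and deployment values are placeholders, and the exact AzureOpenAIModelConfiguration fields are assumed from the package's public surface rather than taken from this diff.

# Hypothetical usage sketch for the overloads above; configuration values are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, CoherenceEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment-name>",
)

coherence = CoherenceEvaluator(model_config)

# Overload 1: single query/response pair. Expected keys include "coherence"
# and the legacy "gpt_coherence" key mentioned in the note above.
result = coherence(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)

# Overload 2: a multi-turn conversation; per-turn scores are aggregated.
conversation = {
    "messages": [
        {"role": "user", "content": "What is the capital of Japan?"},
        {"role": "assistant", "content": "The capital of Japan is Tokyo."},
    ]
}
conversation_result = coherence(conversation=conversation)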
--- a/azure/ai/evaluation/_evaluators/_common/_base_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -7,11 +7,12 @@ from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
-from typing_extensions import ParamSpec, TypeAlias
+from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
 from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._model_configurations import Conversation
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -88,7 +89,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
     # super().__call__(<inputs>)
-    def __call__(
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
         one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
         The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -127,11 +132,19 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :rtype: List[str]
         """
 
+        overloads = get_overloads(self.__call__)
+        if not overloads:
+            call_signatures = [inspect.signature(self.__call__)]
+        else:
+            call_signatures = [inspect.signature(overload) for overload in overloads]
         call_signature = inspect.signature(self.__call__)
         singletons = []
-        for
-
-
+        for call_signature in call_signatures:
+            params = call_signature.parameters
+            if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
+                continue
+            # exclude self since it is not a singleton input
+            singletons.extend([p for p in params if p != "self"])
         return singletons
 
     def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
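To illustrate the overload-driven signature inspection added above, here is a minimal, self-contained sketch (not package code) of how typing_extensions.get_overloads and inspect.signature combine to collect singleton keyword names. ExampleEvaluator and singleton_inputs are made-up names used only for this illustration.

# Minimal sketch: derive keyword parameter names from the registered __call__ overloads.
import inspect
from typing import Dict

from typing_extensions import get_overloads, overload


class ExampleEvaluator:
    @overload
    def __call__(self, *, query: str, response: str) -> Dict[str, float]: ...

    @overload
    def __call__(self, *, conversation: dict) -> Dict[str, float]: ...

    def __call__(self, *args, **kwargs):
        return {}

    def singleton_inputs(self):
        # Inspect every registered overload; fall back to the implementation if none exist.
        overloads = get_overloads(self.__call__)
        signatures = [inspect.signature(o) for o in overloads] or [inspect.signature(self.__call__)]
        names = []
        for sig in signatures:
            params = sig.parameters
            if "conversation" in params:  # skip the conversation-style (non-singleton) overload
                continue
            names.extend(p for p in params if p != "self")
        return names


print(ExampleEvaluator().singleton_inputs())  # ['query', 'response']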
@@ -190,6 +203,59 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
         return converter
 
+    def _derive_multi_modal_conversation_converter(self) -> Callable[[Dict], List[Dict[str, Any]]]:
+        """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
+        This uses the inputs derived from the _derive_singleton_inputs function to determine which
+        aspects of a conversation ought to be extracted.
+
+        :return: The function that will be used to convert conversations to evaluable inputs.
+        :rtype: Callable
+        """
+
+        def multi_modal_converter(conversation: Dict) -> List[Dict[str, Any]]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
+            # Extract user messages, assistant messages from conversation
+            user_messages: List[Dict[str, Any]] = []
+            assistant_messages: List[Dict[str, Any]] = []
+            system_messages: List[Dict[str, Any]] = []
+
+            # Convert conversation slice into queries and responses.
+            # Assume that 'user' role is asking queries and 'assistant' role is responding.
+            if self._eval_last_turn and len(messages) > 1:
+                messages = messages[-2:]
+
+            for each_turn in messages:
+                role = each_turn["role"]
+                if role == "user":
+                    user_messages.append(each_turn)
+                elif role == "assistant":
+                    assistant_messages.append(each_turn)
+                elif role == "system":
+                    system_messages.append(each_turn)
+
+            # validation
+            if len(user_messages) != len(assistant_messages):
+                raise EvaluationException(
+                    message="Mismatched number of user and assistant messages.",
+                    internal_message=("Mismatched number of user and assistant messages."),
+                )
+            if len(assistant_messages) > 1:
+                raise EvaluationException(
+                    message="Conversation can have only one assistant message.",
+                    internal_message=("Conversation can have only one assistant message."),
+                )
+            eval_conv_inputs = []
+            for user_msg, assist_msg in zip(user_messages, assistant_messages):
+                conv_messages = []
+                if len(system_messages) == 1:
+                    conv_messages.append(system_messages[0])
+                conv_messages.append(user_msg)
+                conv_messages.append(assist_msg)
+                eval_conv_inputs.append({"conversation": Conversation(messages=conv_messages)})
+            return eval_conv_inputs
+
+        return multi_modal_converter
+
     def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
@@ -198,7 +264,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         values.
 
         The self._singleton_inputs list assigned during initialization is used to find and extract
-        singleton keywords, and self.
+        singleton keywords, and self._allow_conversation_input is used to determine if a conversation
         is a valid input.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
@@ -229,6 +295,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             )
         # Handle Conversation
         if conversation is not None:
+            if self._is_multi_modal_conversation(conversation):
+                return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
         required_singletons = remove_optional_singletons(self, singletons)
@@ -243,6 +311,20 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 target=ErrorTarget.CONVERSATION,
             )
 
+    def _is_multi_modal_conversation(self, conversation: Dict) -> bool:
+        if "messages" not in conversation:
+            return False
+        messages = conversation["messages"]
+        if not isinstance(messages, list):
+            return False
+        for message in messages:
+            if "content" in message:
+                content = message.get("content", "")
+                if isinstance(content, list):
+                    if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content):
+                        return True
+        return False
+
     def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
 
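For reference, a conversation that the new _is_multi_modal_conversation check classifies as multi-modal, and that multi_modal_converter splits into one evaluable input per user/assistant pair, looks roughly like the sketch below. The URL and message text are placeholders.

# Illustrative input only; any message whose "content" is a list containing an
# {"type": "image_url", "image_url": {"url": ...}} item marks the conversation as multi-modal.
conversation = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is shown in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
            ],
        },
        {"role": "assistant", "content": "The image shows a red apple on a table."},
    ]
}
# multi_modal_converter pairs each user message with the matching assistant message
# (plus the single system message, if present) and yields [{"conversation": {...}}].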
--- a/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -4,12 +4,13 @@
 
 import math
 import re
-from typing import Dict, Union
+from typing import Dict, TypeVar, Union
 
 from promptflow.core import AsyncPrompty
 from typing_extensions import override
 
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase
 
@@ -18,8 +19,10 @@ try:
 except ImportError:
     USER_AGENT = "None"
 
+T = TypeVar("T")
 
-
+
+class PromptyEvaluatorBase(EvaluatorBase[T]):
     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
     linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
@@ -45,10 +48,12 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
         self._prompty_file = prompty_file
         super().__init__(eval_last_turn=eval_last_turn)
 
+        subclass_name = self.__class__.__name__
+        user_agent = f"{USER_AGENT} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
-
+            user_agent,
         )
 
         self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
@@ -67,6 +72,14 @@ class PromptyEvaluatorBase(EvaluatorBase[float]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         score = math.nan
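One consequence of the guard above: prompty-based evaluators only accept text inputs, so a multi-modal conversation routed to one of them arrives without "query" or "response" and is rejected. A rough standalone equivalent of the check (not package code, which raises EvaluationException rather than ValueError):

def ensure_text_only(eval_input: dict) -> None:
    # Mirrors the added check: text evaluators need at least a query or a response.
    if "query" not in eval_input and "response" not in eval_input:
        raise ValueError("Only text conversation inputs are supported.")

ensure_text_only({"query": "What is 2 + 2?", "response": "4"})  # passes
# ensure_text_only({"conversation": {...}})  # would raise for image-bearing conversations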
--- a/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict,
+from typing import Dict, TypeVar, Union
 
 from typing_extensions import override
 
@@ -11,14 +11,15 @@ from azure.ai.evaluation._common.constants import (
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
 from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
 
-T =
+T = TypeVar("T")
 
 
 class RaiServiceEvaluatorBase(EvaluatorBase[T]):
@@ -50,12 +51,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         self._credential = credential
 
     @override
-    def __call__(
+    def __call__(  # pylint: disable=docstring-missing-param
         self,
-
-        query: Optional[str] = None,
-        response: Optional[str] = None,
-        conversation=None,
+        *args,
         **kwargs,
     ):
         """Evaluate either a query and response or a conversation. Must supply either a query AND response,
@@ -71,7 +69,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
@@ -84,6 +82,36 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "query" in eval_input and "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys.
+
+        :param conversation: The conversation to evaluate.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
         if query is None or response is None:
@@ -108,12 +136,13 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
             )
             input_data["context"] = context
 
-        return await evaluate_with_rai_service(
+        return await evaluate_with_rai_service(  # type: ignore
            metric_name=self._eval_metric,
            data=input_data,
            project_scope=self._azure_ai_project,
            credential=self._credential,
            annotation_task=self._get_task(),
+           evaluator_name=self.__class__.__name__,
         )
 
     def _get_task(self):
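With _evaluate_conversation in place, service-backed safety evaluators can score image-bearing conversations as well as plain text pairs. A hedged usage sketch follows; the project values, credential, and image URL are placeholders, and only the constructor shape shown elsewhere in this diff is assumed.

# Hypothetical usage sketch; azure_ai_project values and the image URL are placeholders.
from azure.ai.evaluation import ViolenceEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

violence = ViolenceEvaluator(DefaultAzureCredential(), azure_ai_project)

# Text pair: routed to _evaluate_query_response -> evaluate_with_rai_service.
text_result = violence(query="Describe the painting.", response="It shows a calm seaside town.")

# Multi-modal conversation: routed to _evaluate_conversation -> evaluate_with_rai_service_multimodal.
conversation = {
    "messages": [
        {"role": "user", "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/photo.png"}},
        ]},
        {"role": "assistant", "content": "The image shows a red apple on a table."},
    ]
}
image_result = violence(conversation=conversation)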
--- a/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
+++ b/azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py
@@ -2,13 +2,14 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
-from typing import Callable, Dict, List,
+from typing import Callable, Dict, List, Union
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-from typing_extensions import override
+from typing_extensions import overload, override
 
-from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
 
 from ._hate_unfairness import HateUnfairnessEvaluator
 from ._self_harm import SelfHarmEvaluator
@@ -17,61 +18,36 @@ from ._violence import ViolenceEvaluator
 
 
 @experimental
-class ContentSafetyEvaluator(EvaluatorBase):
+class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     """
-    Initialize a content safety evaluator configured to evaluate content
+    Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
 
     :param credential: The credential for connecting to Azure AI project. Required
     :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
-    :type eval_last_turn: bool
     :param kwargs: Additional arguments to pass to the evaluator.
     :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
-
-
-
-
-
-
-
-
-
-            "project_name": "<project_name>",
-        }
-        eval_fn = ContentSafetyEvaluator(azure_ai_project)
-        result = eval_fn(
-            query="What is the capital of France?",
-            response="Paris.",
-        )
-
-    **Output format**
-
-    .. code-block:: python
-
-        {
-            "violence": "Medium",
-            "violence_score": 5.0,
-            "violence_reason": "Some reason",
-            "sexual": "Medium",
-            "sexual_score": 5.0,
-            "sexual_reason": "Some reason",
-            "self_harm": "Medium",
-            "self_harm_score": 5.0,
-            "self_harm_reason": "Some reason",
-            "hate_unfairness": "Medium",
-            "hate_unfairness_score": 5.0,
-            "hate_unfairness_reason": "Some reason"
-        }
+
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START content_safety_evaluator]
+            :end-before: [END content_safety_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a ContentSafetyEvaluator.
     """
 
+    id = "content_safety"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
     # TODO address 3579092 to re-enabled parallel evals.
-    def __init__(self, credential, azure_ai_project,
-        super().__init__(
-        self._parallel = kwargs.pop("
+    def __init__(self, credential, azure_ai_project, **kwargs):
+        super().__init__()
+        self._parallel = kwargs.pop("_parallel", True)
         self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
@@ -79,13 +55,43 @@ class ContentSafetyEvaluator(EvaluatorBase):
             HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
-    @
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: str,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate a collection of content safety metrics for the given query/response pair
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :return: The content safety scores.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @overload
     def __call__(
         self,
         *,
-
-
-        conversation
+        conversation: Conversation,
+    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
+        """Evaluate a collection of content safety metrics for a conversation
+
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The content safety scores.
+        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
         **kwargs,
     ):
         """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
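Since the old inline example and output-format block were removed from this docstring, here is a hedged sketch of the composite call. The credential and project values are placeholders; the output keys listed in the comment are the ones that appeared in the removed block above.

# Hypothetical usage sketch; credential and project values are placeholders.
from azure.ai.evaluation import ContentSafetyEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

safety = ContentSafetyEvaluator(DefaultAzureCredential(), azure_ai_project)
result = safety(query="What is the capital of France?", response="Paris.")

# Per the removed output-format block, the result carries one triple per metric:
#   violence, violence_score, violence_reason,
#   sexual, sexual_score, sexual_reason,
#   self_harm, self_harm_score, self_harm_reason,
#   hate_unfairness, hate_unfairness_score, hate_unfairness_reason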
@@ -100,9 +106,9 @@ class ContentSafetyEvaluator(EvaluatorBase):
             to be dictionaries with keys "content", "role", and possibly "context".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
         :return: The evaluation result.
-        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
-        return super().__call__(
+        return super().__call__(*args, **kwargs)
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
@@ -124,7 +130,7 @@ class ContentSafetyEvaluator(EvaluatorBase):
         with ThreadPoolExecutor() as executor:
             # pylint: disable=no-value-for-parameter
             futures = {
-                executor.submit(query=query, response=response, conversation=conversation): evaluator
+                executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
                 for evaluator in self._evaluators
            }
 
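The last hunk fixes a genuine bug: ThreadPoolExecutor.submit takes the callable as its first positional argument, so the old call never passed a function at all. A minimal standalone illustration of the corrected pattern (not package code; it uses the stdlib executor and a made-up fake_evaluator rather than promptflow's ThreadPoolExecutorWithContext):

# concurrent.futures semantics the fix relies on: submit(fn, *args, **kwargs).
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_evaluator(*, query, response, conversation=None):
    # Stand-in for a per-metric evaluator; returns a dummy score.
    return {"metric": float(len(response))}

evaluators = [fake_evaluator]

with ThreadPoolExecutor() as executor:
    # Old code did executor.submit(query=..., response=...), which raises TypeError
    # because no callable is supplied; the evaluator must come first.
    futures = {
        executor.submit(evaluator, query="hi", response="hello", conversation=None): evaluator
        for evaluator in evaluators
    }
    for future in as_completed(futures):
        print(futures[future].__name__, future.result())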