azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +23 -1
- azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +20 -9
- azure/ai/evaluation/_common/constants.py +9 -2
- azure/ai/evaluation/_common/math.py +29 -0
- azure/ai/evaluation/_common/rai_service.py +222 -93
- azure/ai/evaluation/_common/utils.py +328 -19
- azure/ai/evaluation/_constants.py +16 -8
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +14 -7
- azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +22 -4
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +47 -14
- azure/ai/evaluation/_evaluate/_evaluate.py +370 -188
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +15 -16
- azure/ai/evaluation/_evaluate/_utils.py +77 -25
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +16 -10
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +76 -46
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +26 -19
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +62 -25
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -36
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +67 -46
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +33 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +33 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +7 -5
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +22 -21
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +51 -16
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +46 -13
- azure/ai/evaluation/_evaluators/_qa/_qa.py +11 -6
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +23 -20
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +126 -80
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +32 -15
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +36 -10
- azure/ai/evaluation/_exceptions.py +26 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +23 -6
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +88 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +26 -5
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +98 -95
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -9
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -5
- azure/ai/evaluation/simulator/_simulator.py +222 -169
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- azure/ai/evaluation/simulator/_utils.py +6 -6
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +237 -52
- azure_ai_evaluation-1.0.0b5.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
- azure_ai_evaluation-1.0.0b3.dist-info/RECORD +0 -98
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_common/_base_eval.py

```diff
@@ -2,19 +2,56 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import List, Dict, Callable, Any
 import inspect
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
 
-from abc import ABC
-
-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import ParamSpec, TypeAlias
+
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.utils import remove_optional_singletons
+
+P = ParamSpec("P")
+T = TypeVar("T")
+T_EvalValue = TypeVar("T_EvalValue")
+
+
+class DerivedEvalInput(TypedDict, total=False):
+    """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+    query: Dict[str, Any]
+    response: Dict[str, Any]
+    context: str
+
+
+AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+"""TypeAlias that models the return value of EvaluatorBase._aggregate_results
+
+    .. code-block:: python
 
-
+        foo: AggregateResult[float] = {
+            "evaluation_per_turn": {
+                "coherence": [1.0, 2.0, 3.0]
+            },
+            "coherence": 2.0
+        }
+"""
+
+DoEvalResult: TypeAlias = Dict[str, T]
+"""TypeAlias that models the return value of EvaluatorBase._do_eval
+
+    .. code-block:: python
+
+        foo: DoEvalResult[float] = {
+            "coherence": 2.0
+        }
+"""
 
 
 # TODO exception target pass down?
-class EvaluatorBase(ABC):
+class EvaluatorBase(ABC, Generic[T_EvalValue]):
     """Base class for all evaluators that are capable of accepting either a group of single values,
     or conversation as input. All such evaluators need to implement two functions of their own:
     - _convert_conversation_to_eval_input
@@ -51,7 +88,7 @@ class EvaluatorBase(ABC):
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
    # super().__call__(<inputs>)
-    def __call__(self, **kwargs) ->
+    def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
        """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
        one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
        The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -60,13 +97,12 @@ class EvaluatorBase(ABC):
         :keyword kwargs: A dictionary that contains inputs needed to evaluate a conversation.
         :type kwargs: Dict
         :return: The evaluation result
-        :rtype:
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
 
-
-
-    async def _do_eval(self, eval_input: Any) -> Dict:
+    @abstractmethod
+    async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
         """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
         In the default case, all required inputs are assumed to be within eval_input, as user-friendly
         typing is handled above this function in favor of polymorphic simplicity. This function must be
@@ -75,13 +111,8 @@ class EvaluatorBase(ABC):
         :param eval_input: Whatever inputs are needed for this evaluator to perform a single evaluation.
         :type eval_input: Any
         :return: A single evaluation result
-        :rtype:
-
+        :rtype: DoEvalResult[T_EvalValue]
         """
-        raise EvaluationException(
-            message="Not implemented",
-            internal_message="BaseConversationEval's _do_eval method called somehow. This should be overridden.",
-        )
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
@@ -103,7 +134,7 @@ class EvaluatorBase(ABC):
                 singletons.append(param)
         return singletons
 
-    def _derive_conversation_converter(self) -> Callable:
+    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -115,12 +146,12 @@ class EvaluatorBase(ABC):
         include_query = "query" in self._singleton_inputs
         include_response = "response" in self._singleton_inputs
 
-        def converter(conversation: Dict) -> List:
-            messages = conversation["messages"]
+        def converter(conversation: Dict) -> List[DerivedEvalInput]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
             global_context = conversation.get("context", None)
             # Extract queries, responses from conversation
-            queries = []
-            responses = []
+            queries: List[Dict[str, Any]] = []
+            responses: List[Dict[str, Any]] = []
 
             # Convert conversation slice into queries and responses.
             # Assume that 'user' role is asking queries and 'assistant' role is responding.
@@ -142,16 +173,16 @@ class EvaluatorBase(ABC):
                 response_context = response.get("context", None)
                 if global_context:
                     context["global_context"] = global_context
-                if query_context and
+                if query_context and include_query:
                     context["query_context"] = query_context
-                if response_context and
+                if response_context and include_response:
                     context["response_context"] = response_context
 
-                eval_input = {}
+                eval_input: DerivedEvalInput = {}
                 if include_query:
-                    eval_input["query"] = query
+                    eval_input["query"] = query.get("content", "")
                 if include_response:
-                    eval_input["response"] = response
+                    eval_input["response"] = response.get("content", "")
                 if include_context:
                     eval_input["context"] = str(context)
                 eval_inputs.append(eval_input)
@@ -159,7 +190,7 @@ class EvaluatorBase(ABC):
 
         return converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> List:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
@@ -189,9 +220,9 @@ class EvaluatorBase(ABC):
         singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
+            msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
             raise EvaluationException(
-                message=
-                internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                message=msg,
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.CONVERSATION,
@@ -200,18 +231,19 @@ class EvaluatorBase(ABC):
         if conversation is not None:
             return self._derive_conversation_converter()(conversation)
         # Handle Singletons
-
-
+        required_singletons = remove_optional_singletons(self, singletons)
+        if all(value is not None for value in required_singletons.values()):
+            return [singletons]
         # Missing input
+        msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
-            message=
-            internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+            message=msg,
             blame=ErrorBlame.USER_ERROR,
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,
         )
 
-    def _aggregate_results(self, per_turn_results: List[
+    def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
         """Aggregate the evaluation results of each conversation turn into a single result.
 
         Exact implementation might need to vary slightly depending on the results produced.
@@ -224,11 +256,11 @@ class EvaluatorBase(ABC):
             values (including non-numerics) located in under the "evaluation_per_turn" key,
             which each sub-key being a metric and each sub-value being a the list of that metric's
             per-turn values.
-        :rtype:
+        :rtype: AggregateResult[T_EvalValue]
         """
 
-        aggregated = {}
-        evaluation_per_turn = {}
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
 
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
@@ -241,19 +273,18 @@ class EvaluatorBase(ABC):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] =
+                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
-
         return aggregated
 
-    async def _real_call(self, **kwargs):
+    async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
         :keyword kwargs: The inputs to evaluate.
         :type kwargs: Dict
         :return: The evaluation result.
-        :rtype:
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
@@ -270,9 +301,8 @@ class EvaluatorBase(ABC):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
-
-
-    def _to_async(self):
+    @final
+    def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
 
@@ -286,7 +316,7 @@ class AsyncEvaluatorBase:
 
     # Don't look at my shame. Nothing to see here....
     # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
-    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
+    # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
     # are just not passed into this function instead of ending up in kwargs.
    # Since we want this to be relatively call-agnostic, we just account for every input that any children
     # are known to throw at this, mash them into kwargs, and then pass them into the real call.
```
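The main change in `_base_eval.py` is that `EvaluatorBase` becomes generic over its result value type and drops numpy in favor of the new `list_mean` helper. The following standalone sketch is not the SDK's code; it only mirrors the names in the hunks above, and assumes Python 3.10+ so that `TypeAlias`/`TypedDict` come straight from `typing`:

```python
from typing import Dict, Generic, List, TypeAlias, TypeVar, Union

T = TypeVar("T")
T_EvalValue = TypeVar("T_EvalValue")

DoEvalResult: TypeAlias = Dict[str, T]
AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]


def list_mean(nums: List[Union[int, float]]) -> float:
    # Plain-Python replacement for the numpy mean the old code relied on.
    return sum(nums) / len(nums)


class MiniEvaluatorBase(Generic[T_EvalValue]):
    """Toy stand-in for EvaluatorBase[T_EvalValue], showing the aggregation shape."""

    def _aggregate_results(
        self, per_turn_results: List[DoEvalResult[T_EvalValue]]
    ) -> AggregateResult[T_EvalValue]:
        # Rotate per-turn dicts into metric -> [values], then average numeric metrics.
        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
        for turn in per_turn_results:
            for metric, value in turn.items():
                evaluation_per_turn.setdefault(metric, []).append(value)

        aggregated: AggregateResult[T_EvalValue] = {}
        for metric, values in evaluation_per_turn.items():
            if all(isinstance(v, (int, float)) for v in values):
                aggregated[metric] = list_mean(values)  # type: ignore[arg-type]
        aggregated["evaluation_per_turn"] = evaluation_per_turn
        return aggregated


turns: List[DoEvalResult[float]] = [{"coherence": 1.0}, {"coherence": 2.0}, {"coherence": 3.0}]
print(MiniEvaluatorBase[float]()._aggregate_results(turns))
```

Running it prints `{'coherence': 2.0, 'evaluation_per_turn': {'coherence': [1.0, 2.0, 3.0]}}`, the same shape the `AggregateResult` docstring example in the hunk describes.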
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py

```diff
@@ -2,26 +2,24 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
+import math
 import re
-from typing import Dict
-
-from typing_extensions import override
-
-
-import numpy as np
+from typing import Dict, Union
 
 from promptflow.core import AsyncPrompty
+from typing_extensions import override
 
-from
+from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
+from . import EvaluatorBase
 
 try:
     from ..._user_agent import USER_AGENT
 except ImportError:
-    USER_AGENT = None
-from . import EvaluatorBase
+    USER_AGENT = "None"
 
 
-class PromptyEvaluatorBase(EvaluatorBase):
+class PromptyEvaluatorBase(EvaluatorBase[float]):
     """Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
     make use of a prompty file, and return their results as a dictionary, with a single key-value pair
     linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
@@ -39,17 +37,17 @@ class PromptyEvaluatorBase(EvaluatorBase):
     :type ignore_queries: bool
     """
 
-
-
+    _LLM_CALL_TIMEOUT = 600
+    _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
 
-    def __init__(self, *, result_key: str, prompty_file: str, model_config:
+    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
         self._result_key = result_key
         self._prompty_file = prompty_file
         super().__init__(eval_last_turn=eval_last_turn)
 
         prompty_model_config = construct_prompty_model_config(
-            model_config,
-            self.
+            validate_model_config(model_config),
+            self._DEFAULT_OPEN_API_VERSION,
             USER_AGENT,
         )
 
@@ -59,7 +57,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
     # defining a default here.
 
     @override
-    async def _do_eval(self, eval_input: Dict) -> Dict:
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
         """Do a relevance evaluation.
 
         :param eval_input: The input to the evaluator. Expected to contain
@@ -69,11 +67,20 @@ class PromptyEvaluatorBase(EvaluatorBase):
         :return: The evaluation result.
         :rtype: Dict
         """
-        llm_output = await self._flow(timeout=self.
+        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
-        score =
+        score = math.nan
         if llm_output:
+            # Parse out score and reason from evaluators known to possess them.
+            if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
+                score, reason = parse_quality_evaluator_reason_score(llm_output)
+                return {
+                    self._result_key: float(score),
+                    f"gpt_{self._result_key}": float(score),
+                    f"{self._result_key}_reason": reason,
+                }
             match = re.search(r"\d", llm_output)
             if match:
                 score = float(match.group())
-
+            return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
+        return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
```
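The `_do_eval` hunk above establishes a three-way flow: reason-capable evaluators go through `parse_quality_evaluator_reason_score`, everything else falls back to grabbing the first digit from the LLM output, and a missing output yields `math.nan`. The sketch below imitates that flow with a hypothetical parser; the real helper's expected output format is not shown in this diff, so the `Reason:`/`Score:` layout and the membership of the stand-in constant are invented for illustration only:

```python
import math
import re
from typing import Dict, Tuple, Union

# Stand-in for PROMPT_BASED_REASON_EVALUATORS; the real constant lives in
# azure.ai.evaluation._common.constants and its membership may differ.
REASON_CAPABLE_RESULT_KEYS = {"coherence", "fluency", "relevance", "groundedness"}


def parse_reason_and_score(llm_output: str) -> Tuple[float, str]:
    # Hypothetical "Score: N" / "Reason: ..." layout; parse_quality_evaluator_reason_score
    # may expect a different format entirely.
    score_match = re.search(r"Score:\s*(\d+(?:\.\d+)?)", llm_output)
    reason_match = re.search(r"Reason:\s*(.+)", llm_output)
    score = float(score_match.group(1)) if score_match else math.nan
    reason = reason_match.group(1).strip() if reason_match else ""
    return score, reason


def interpret_llm_output(result_key: str, llm_output: str) -> Dict[str, Union[float, str]]:
    score: float = math.nan
    if llm_output:
        if result_key in REASON_CAPABLE_RESULT_KEYS:
            score, reason = parse_reason_and_score(llm_output)
            return {
                result_key: float(score),
                f"gpt_{result_key}": float(score),  # legacy gpt_-prefixed key kept alongside
                f"{result_key}_reason": reason,
            }
        match = re.search(r"\d", llm_output)  # fallback: first digit anywhere in the output
        if match:
            score = float(match.group())
    return {result_key: float(score), f"gpt_{result_key}": float(score)}


print(interpret_llm_output("coherence", "Score: 4\nReason: Clear and consistent."))
print(interpret_llm_output("similarity", "3 out of 5"))
```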
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

```diff
@@ -1,48 +1,53 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict, Optional, Union
 
-from typing import Dict, Optional
 from typing_extensions import override
 
-from azure.
-
+from azure.ai.evaluation._common.constants import (
+    EvaluationMetrics,
+    _InternalEvaluationMetrics,
+    Tasks,
+    _InternalAnnotationTasks,
+)
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.core.credentials import TokenCredential
+
 from . import EvaluatorBase
 
+T = Union[str, float]
+
 
-class RaiServiceEvaluatorBase(EvaluatorBase):
+class RaiServiceEvaluatorBase(EvaluatorBase[T]):
     """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
     This includes content safety evaluators, protected material evaluators, and others. These evaluators
     are all assumed to be of the "query and response or conversation" input variety.
 
-    param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
-
-    type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
-    param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+    :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+        to specify which evaluation to perform.
+    :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
         aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
-    type eval_last_turn: bool
+    :type eval_last_turn: bool
     """
 
     @override
     def __init__(
         self,
-        eval_metric: EvaluationMetrics,
+        eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
         azure_ai_project: dict,
-        credential:
+        credential: TokenCredential,
         eval_last_turn: bool = False,
     ):
         super().__init__(eval_last_turn=eval_last_turn)
         self._eval_metric = eval_metric
-        self._azure_ai_project = azure_ai_project
-
-            # Use DefaultCredential if no credential is provided
-            self._credential = DefaultAzureCredential()
-        else:
-            self._credential = credential
+        self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+        self._credential = credential
 
     @override
     def __call__(
@@ -50,8 +55,8 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
         *,
         query: Optional[str] = None,
         response: Optional[str] = None,
-        conversation
-        **kwargs
+        conversation=None,
+        **kwargs,
     ):
         """Evaluate either a query and response or a conversation. Must supply either a query AND response,
         or a conversation, but not both.
@@ -63,14 +68,13 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
             key "messages", and potentially a global context under the key "context". Conversation turns are expected
             to be dictionaries with keys "content", "role", and possibly "context".
-        :paramtype conversation: Optional[
-        :
-        :rtype: Dict
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :rtype: Union[Dict[str, T], Dict[str, Union[float, Dict[str, List[T]]]]]
         """
         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
 
     @override
-    async def _do_eval(self, eval_input: Dict):
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
         """Perform the evaluation using the Azure AI RAI service.
         The exact evaluation performed is determined by the evaluation metric supplied
         by the child class initializer.
@@ -90,10 +94,43 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
                     + " This should have failed earlier."
                 ),
             )
+        input_data = {"query": query, "response": response}
+
+        if "context" in self._singleton_inputs:
+            context = eval_input.get("context", None)
+            if context is None:
+                raise EvaluationException(
+                    message="Not implemented",
+                    internal_message=(
+                        "Attempted context-based evaluation without supplying context."
+                        + " This should have failed earlier."
+                    ),
+                )
+            input_data["context"] = context
+
         return await evaluate_with_rai_service(
             metric_name=self._eval_metric,
-
-            response=response,
+            data=input_data,
             project_scope=self._azure_ai_project,
             credential=self._credential,
+            annotation_task=self._get_task(),
         )
+
+    def _get_task(self):
+        """Get the annotation task for the current evaluation metric.
+        The annotation task is used by the RAI service script to determine a the message format
+        of the API call, and how the output is processed, among other things.
+
+        :return: The annotation task for the evaluator's self._eval_metric value.
+        :rtype: ~azure.ai.evaluation._common.constants.Tasks
+
+        """
+        if self._eval_metric == EvaluationMetrics.GROUNDEDNESS:
+            return Tasks.GROUNDEDNESS
+        if self._eval_metric == EvaluationMetrics.XPIA:
+            return Tasks.XPIA
+        if self._eval_metric == _InternalEvaluationMetrics.ECI:
+            return _InternalAnnotationTasks.ECI
+        if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
+            return Tasks.PROTECTED_MATERIAL
+        return Tasks.CONTENT_HARM
```
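The new `_get_task` method is a plain metric-to-annotation-task dispatch: a few metrics get dedicated tasks and everything else is treated as a content-harm annotation. Below is a standalone sketch of the same shape using local placeholder enums rather than the SDK's `EvaluationMetrics`/`Tasks` constants (whose string values are not shown in this diff):

```python
from enum import Enum


class Metric(Enum):
    # Placeholder values; the SDK's EvaluationMetrics/_InternalEvaluationMetrics differ.
    VIOLENCE = "violence"
    GROUNDEDNESS = "groundedness"
    XPIA = "xpia"
    ECI = "eci"
    PROTECTED_MATERIAL = "protected_material"


class Task(Enum):
    # Placeholder values; the SDK's Tasks/_InternalAnnotationTasks differ.
    CONTENT_HARM = "content_harm"
    GROUNDEDNESS = "groundedness"
    XPIA = "xpia"
    ECI = "eci"
    PROTECTED_MATERIAL = "protected_material"


_DEDICATED_TASKS = {
    Metric.GROUNDEDNESS: Task.GROUNDEDNESS,
    Metric.XPIA: Task.XPIA,
    Metric.ECI: Task.ECI,
    Metric.PROTECTED_MATERIAL: Task.PROTECTED_MATERIAL,
}


def get_task(metric: Metric) -> Task:
    # Same shape as RaiServiceEvaluatorBase._get_task: dedicated annotation task
    # where one exists, generic content-harm annotation otherwise.
    return _DEDICATED_TASKS.get(metric, Task.CONTENT_HARM)


assert get_task(Metric.VIOLENCE) is Task.CONTENT_HARM
assert get_task(Metric.ECI) is Task.ECI
```

Expressing the chain of `if` checks as a table lookup keeps the default in one place; the diff's version spells the same mapping out as sequential comparisons.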
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

```diff
@@ -2,32 +2,34 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
+from typing import Callable, Dict, List, Optional, Union
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import override
 
-
-
-    from ._self_harm import SelfHarmEvaluator
-    from ._sexual import SexualEvaluator
-    from ._violence import ViolenceEvaluator
-except ImportError:
-    from _hate_unfairness import HateUnfairnessEvaluator
-    from _self_harm import SelfHarmEvaluator
-    from _sexual import SexualEvaluator
-    from _violence import ViolenceEvaluator
+from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
+from ._hate_unfairness import HateUnfairnessEvaluator
+from ._self_harm import SelfHarmEvaluator
+from ._sexual import SexualEvaluator
+from ._violence import ViolenceEvaluator
 
-
+
+@experimental
+class ContentSafetyEvaluator(EvaluatorBase):
     """
     Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param
-
-    :param
-    :type
+    :param eval_last_turn: Whether to evaluate the last turn of a conversation. Default is False.
+    :type eval_last_turn: bool
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
     :rtype: Callable
 
@@ -66,33 +68,63 @@ class ContentSafetyEvaluator:
         }
     """
 
-
-
-
-
-
-
-
+    # TODO address 3579092 to re-enabled parallel evals.
+    def __init__(self, credential, azure_ai_project, eval_last_turn: bool = False, **kwargs):
+        super().__init__(eval_last_turn=eval_last_turn)
+        self._parallel = kwargs.pop("parallel", False)
+        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
+            ViolenceEvaluator(credential, azure_ai_project),
+            SexualEvaluator(credential, azure_ai_project),
+            SelfHarmEvaluator(credential, azure_ai_project),
+            HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
-
+    @override
+    def __call__(
+        self,
+        *,
+        query: Optional[str] = None,
+        response: Optional[str] = None,
+        conversation=None,
+        **kwargs,
+    ):
+        """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
+        This inputs must supply either a query AND response, or a conversation, but not both.
+
+        :keyword query: The query to evaluate.
+        :paramtype query: Optional[str]
+        :keyword response: The response to evaluate.
+        :paramtype response: Optional[str]
+        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+            key "messages", and potentially a global context under the key "context". Conversation turns are expected
+            to be dictionaries with keys "content", "role", and possibly "context".
+        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+        :return: The evaluation result.
+        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
-
-
-
-
-
-
-
-
-        :
-        :
+        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
+        """Perform the evaluation using the Azure AI RAI service.
+        The exact evaluation performed is determined by the evaluation metric supplied
+        by the child class initializer.
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
         """
-
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        conversation = eval_input.get("conversation", None)
+        results: Dict[str, Union[str, float]] = {}
+        # TODO fix this to not explode on empty optional inputs (PF SKD error)
         if self._parallel:
             with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
                 futures = {
-                    executor.submit(
+                    executor.submit(query=query, response=response, conversation=conversation): evaluator
                     for evaluator in self._evaluators
                 }
 
@@ -100,7 +132,7 @@ class ContentSafetyEvaluator:
                 results.update(future.result())
         else:
             for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response,
+                result = evaluator(query=query, response=response, conversation=conversation)
                 results.update(result)
 
         return results
```