azure-ai-evaluation 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- azure/ai/evaluation/__init__.py +1 -15
- azure/ai/evaluation/_azure/_clients.py +24 -8
- azure/ai/evaluation/_azure/_models.py +2 -2
- azure/ai/evaluation/_common/utils.py +8 -8
- azure/ai/evaluation/_constants.py +21 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +2 -1
- azure/ai/evaluation/_evaluate/_eval_run.py +3 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +74 -14
- azure/ai/evaluation/_evaluate/_utils.py +27 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +46 -25
- azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +69 -4
- azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +61 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +5 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +2 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +2 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +61 -68
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +45 -23
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +55 -34
- azure/ai/evaluation/_evaluators/_qa/_qa.py +32 -27
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +44 -23
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +41 -81
- azure/ai/evaluation/_exceptions.py +0 -1
- azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +640 -0
- azure/ai/evaluation/_version.py +2 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +10 -3
- azure/ai/evaluation/simulator/_conversation/__init__.py +4 -5
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +2 -0
- azure/ai/evaluation/simulator/_simulator.py +21 -13
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/METADATA +77 -7
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/RECORD +40 -44
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.1.0.dist-info → azure_ai_evaluation-1.3.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_common/_base_eval.py

```diff
@@ -4,15 +4,18 @@
 
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final, Optional
 
 from promptflow._utils.async_utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads
 
-from azure.ai.evaluation._common.math import list_mean
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 from azure.ai.evaluation._common.utils import remove_optional_singletons
+from azure.ai.evaluation._constants import _AggregationType
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._common._experimental import experimental
+
+from ._conversation_aggregators import GetAggregator, GetAggregatorType
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -25,6 +28,7 @@ class DerivedEvalInput(TypedDict, total=False):
     query: Dict[str, Any]
     response: Dict[str, Any]
     context: str
+    ground_truth: str
 
 
 AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
@@ -69,6 +73,13 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     :type not_singleton_inputs: List[str]
     :param eval_last_turn: If True, only the last turn of the conversation will be evaluated. Default is False.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param conversation_aggregator_override: A function that will be used to aggregate per-turn results. If provided,
+        overrides the standard aggregator implied by conversation_aggregation_type. None by default.
+    :type conversation_aggregator_override: Optional[Callable[[List[float]], float]]
     """
 
     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~
@@ -80,11 +91,17 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         *,
         not_singleton_inputs: List[str] = ["conversation", "kwargs"],
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        conversation_aggregator_override: Optional[Callable[[List[float]], float]] = None,
     ):
         self._not_singleton_inputs = not_singleton_inputs
         self._eval_last_turn = eval_last_turn
         self._singleton_inputs = self._derive_singleton_inputs()
         self._async_evaluator = AsyncEvaluatorBase(self._real_call)
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+        if conversation_aggregator_override is not None:
+            # Type ignore since we already checked for None, but mypy doesn't know that.
+            self._conversation_aggregation_function = conversation_aggregator_override  # type: ignore[assignment]
 
     # This needs to be overridden just to change the function header into something more informative,
     # and to be able to add a more specific docstring. The actual function contents should just be
@@ -158,6 +175,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         include_context = "context" in self._singleton_inputs
         include_query = "query" in self._singleton_inputs
         include_response = "response" in self._singleton_inputs
+        include_ground_truth = "ground_truth" in self._singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -198,6 +216,8 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                     eval_input["response"] = response.get("content", "")
                 if include_context:
                     eval_input["context"] = str(context)
+                if include_ground_truth:
+                    eval_input["ground_truth"] = response.get("ground_truth", "")
                 eval_inputs.append(eval_input)
             return eval_inputs
 
@@ -355,7 +375,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Find and average all numeric values
         for metric, values in evaluation_per_turn.items():
             if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
+                aggregated[metric] = self._conversation_aggregation_function(cast(List[Union[int, float]], values))
         # Slap the per-turn results back in.
         aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
@@ -383,10 +403,51 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         # Otherwise, aggregate results.
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    # ~~~ METHODS THAT SHOULD NOT BE OVERRIDDEN BY CHILDREN~~~``
+
     @final
     def _to_async(self) -> "AsyncEvaluatorBase":
         return self._async_evaluator
 
+    @experimental
+    @final
+    def _set_conversation_aggregation_type(self, conversation_aggregation_type: _AggregationType) -> None:
+        """Input a conversation aggregation type to re-assign the aggregator function used by this evaluator for
+        multi-turn conversations. This aggregator is used to combine numeric outputs from each evaluation of a
+        multi-turn conversation into a single top-level result.
+
+        :param conversation_aggregation_type: The type of aggregation to perform on the per-turn
+            results of a conversation to produce a single result.
+        :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+        """
+        self._conversation_aggregation_function = GetAggregator(conversation_aggregation_type)
+
+    @experimental
+    @final
+    def _set_conversation_aggregator(self, aggregator: Callable[[List[float]], float]) -> None:
+        """Set the conversation aggregator function directly. This function will be applied to all numeric outputs
+        of an evaluator when it evaluates a conversation with multiple-turns thus ends up with multiple results per
+        evaluation that is needs to coalesce into a single result. Use when built-in aggregators do not
+        suit your needs, but use with caution.
+
+        :param aggregator: The function to use to aggregate per-turn results.
+        :type aggregator: Callable[[List[float]], float]
+        """
+        self._conversation_aggregation_function = aggregator
+
+    @experimental
+    @final
+    def _get_conversation_aggregator_type(self) -> _AggregationType:
+        """Get the current conversation aggregation type used by this evaluator. This refers to the
+        method used when a single input produces multiple evaluation results (ex: when a multi-turn conversation
+        is inputted into an evaluator that evaluates each turn individually). The individual inputs
+        are combined by the function implied here to produce a single overall result.
+
+        :return: The conversation aggregation type.
+        :rtype: ~azure.ai.evaluation._AggregationType
+        """
+        return GetAggregatorType(self._conversation_aggregation_function)
+
 
 class AsyncEvaluatorBase:
     """The asynchronous evaluator hidden underneath all evaluators. This makes generous use passing functions
@@ -402,7 +463,9 @@ class AsyncEvaluatorBase:
     # are just not passed into this function instead of ending up in kwargs.
     # Since we want this to be relatively call-agnostic, we just account for every input that any children
    # are known to throw at this, mash them into kwargs, and then pass them into the real call.
-    async def __call__(self, *, query=None, response=None, context=None, conversation=None, **kwargs):
+    async def __call__(
+        self, *, query=None, response=None, context=None, conversation=None, ground_truth=None, **kwargs
+    ):
        if conversation is not None:
            kwargs["conversation"] = conversation
        if query is not None:
@@ -411,4 +474,6 @@
            kwargs["response"] = response
        if context is not None:
            kwargs["context"] = context
+       if ground_truth is not None:
+           kwargs["ground_truth"] = ground_truth
        return await self._real_call(**kwargs)
```
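Taken together, the `_base_eval.py` changes let any `EvaluatorBase` subclass opt into a conversation aggregation strategy and receive `ground_truth` as a singleton input. Below is a minimal sketch of a hypothetical subclass wired to the new keywords; the class and metric name are invented for illustration, while the constructor keywords and the `_do_eval` contract are taken from the hunks above.

```python
from typing import Dict

from typing_extensions import override

from azure.ai.evaluation._constants import _AggregationType
from azure.ai.evaluation._evaluators._common import EvaluatorBase


class WordOverlapEvaluator(EvaluatorBase[float]):  # hypothetical example, not part of the SDK
    """Scores the fraction of ground-truth words that appear in the response."""

    def __init__(self):
        # Per-turn scores of a conversation are combined with max() instead of the default mean.
        super().__init__(conversation_aggregation_type=_AggregationType.MAX)

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
        response_words = set(eval_input["response"].split())
        truth_words = set(eval_input["ground_truth"].split())
        return {"word_overlap": len(response_words & truth_words) / max(len(truth_words), 1)}
```

Like the built-in evaluators changed in this release, a real subclass would also declare `__call__` overloads so the base class can derive its singleton inputs.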
azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py (new file)

```diff
@@ -0,0 +1,61 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+from concurrent.futures import as_completed
+from typing import TypeVar, Dict, List
+
+from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from typing_extensions import override
+
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
+
+T = TypeVar("T")
+
+
+class MultiEvaluatorBase(EvaluatorBase[T]):
+    """
+    Base class for evaluators that contain and run multiple other evaluators to produce a
+    suite of metrics.
+
+    Child classes still need to implement the __call__ methods, but they shouldn't need a _do_eval.
+
+    :param evaluators: The list of evaluators to run when this evaluator is called.
+    :type evaluators: List[~azure.ai.evaluation._evaluators._common.EvaluatorBase]
+    :param kwargs: Additional arguments to pass to the evaluator.
+    :type kwargs: Any
+    :return: An evaluator that runs multiple other evaluators and combines their results.
+    """
+
+    def __init__(self, evaluators: List[EvaluatorBase[T]], **kwargs):
+        super().__init__()
+        self._parallel = kwargs.pop("_parallel", True)
+        self._evaluators = evaluators
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, T]:
+        """Run each evaluator, possibly in parallel, and combine the results into
+        a single large dictionary containing each evaluation. Inputs are passed
+        directly to each evaluator without additional processing.
+
+
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        results: Dict[str, T] = {}
+        if self._parallel:
+            with ThreadPoolExecutor() as executor:
+                # pylint: disable=no-value-for-parameter
+                futures = {executor.submit(evaluator, **eval_input): evaluator for evaluator in self._evaluators}
+
+                for future in as_completed(futures):
+                    results.update(future.result())
+        else:
+            for evaluator in self._evaluators:
+                result = evaluator(**eval_input)
+                # Ignore is to avoid mypy getting upset over the amount of duck-typing
+                # that's going on to shove evaluators around like this.
+                results.update(result)  # type: ignore[arg-type]
+
+        return results
```
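For illustration, a composite can be assembled the same way `ContentSafetyEvaluator` is rebuilt below. The suite class here is hypothetical, and it assumes `F1ScoreEvaluator` and `BleuScoreEvaluator` keep their no-argument constructors from this package:

```python
from typing import Union

from azure.ai.evaluation import BleuScoreEvaluator, F1ScoreEvaluator
from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase


class LexicalSuiteEvaluator(MultiEvaluatorBase[Union[str, float]]):  # hypothetical example
    """Runs several reference-based evaluators and merges their result dictionaries."""

    def __init__(self, **kwargs):
        evaluators = [F1ScoreEvaluator(), BleuScoreEvaluator()]
        # Passing _parallel=False through kwargs selects the sequential branch of _do_eval
        # instead of the promptflow thread pool.
        super().__init__(evaluators=evaluators, **kwargs)
```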
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

```diff
@@ -15,6 +15,7 @@ from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, e
 from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
 from azure.ai.evaluation._common.utils import validate_conversation
+from azure.ai.evaluation._constants import _AggregationType
 from azure.core.credentials import TokenCredential
 
 from . import EvaluatorBase
@@ -35,6 +36,10 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
        aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
        when this occurs. Default is False, resulting full conversation evaluation and aggregation.
    :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
+        to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
    """
 
    @override
@@ -44,8 +49,9 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         azure_ai_project: dict,
         credential: TokenCredential,
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
     ):
-        super().__init__(eval_last_turn=eval_last_turn)
+        super().__init__(eval_last_turn=eval_last_turn, conversation_aggregation_type=conversation_aggregation_type)
         self._eval_metric = eval_metric
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
```
azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py (new file)

```diff
@@ -0,0 +1,49 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from typing import Callable, List
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._constants import _AggregationType
+
+
+def GetAggregator(aggregation_type: _AggregationType) -> Callable[[List[float]], float]:
+    if aggregation_type == _AggregationType.SUM:
+        return sum
+    if aggregation_type == _AggregationType.MEAN:
+        return list_mean
+    if aggregation_type == _AggregationType.MAX:
+        return max
+    if aggregation_type == _AggregationType.MIN:
+        return min
+    if aggregation_type == _AggregationType.CUSTOM:
+        msg = (
+            "Cannot 'get' aggregator function associated with custom aggregation enum."
+            + " This enum value should only be outputted as an indicator of an injected"
+            + " aggregation function, not inputted directly"
+        )
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.EVALUATE,
+        )
+    raise EvaluationException(
+        message=f"Unaccounted for aggregation type: {aggregation_type}",
+        blame=ErrorBlame.UNKNOWN,
+        category=ErrorCategory.INVALID_VALUE,
+        target=ErrorTarget.EVALUATE,
+    )
+
+
+def GetAggregatorType(aggregation_function: Callable) -> _AggregationType:
+    if aggregation_function == sum:  # pylint: disable=comparison-with-callable
+        return _AggregationType.SUM
+    if aggregation_function == list_mean:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MEAN
+    if aggregation_function == max:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MAX
+    if aggregation_function == min:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MIN
+    return _AggregationType.CUSTOM
```
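The two helpers are plain enum-to-callable mappings, so their behaviour can be checked directly. A small sketch follows; the module paths are private and taken from the file list above, so they may change between releases.

```python
from azure.ai.evaluation._constants import _AggregationType
from azure.ai.evaluation._evaluators._common._conversation_aggregators import (
    GetAggregator,
    GetAggregatorType,
)

aggregate = GetAggregator(_AggregationType.MAX)   # returns the built-in max
print(aggregate([0.2, 0.9, 0.5]))                 # 0.9
print(GetAggregatorType(aggregate))               # _AggregationType.MAX

# Any callable other than sum/list_mean/max/min reports back as CUSTOM, which is how an
# injected conversation_aggregator_override shows up via _get_conversation_aggregator_type().
print(GetAggregatorType(lambda scores: scores[-1]))  # _AggregationType.CUSTOM

# GetAggregator(_AggregationType.CUSTOM) raises an EvaluationException by design.
```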
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

```diff
@@ -1,13 +1,11 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union
+from typing import Dict, List, Union
 
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 from typing_extensions import overload, override
 
-from azure.ai.evaluation._evaluators._common import EvaluatorBase
+from azure.ai.evaluation._evaluators._common import MultiEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -18,7 +16,7 @@ from ._violence import ViolenceEvaluator
 
 
 @experimental
-class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
+class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
     """
     Initialize a content safety evaluator configured to evaluate content safety metrics for QA scenario.
 
@@ -44,16 +42,14 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
     id = "content_safety"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
-    # TODO address 3579092 to re-enabled parallel evals.
     def __init__(self, credential, azure_ai_project, **kwargs):
-
-        self._parallel = kwargs.pop("_parallel", True)
-        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
+        evaluators = [
             ViolenceEvaluator(credential, azure_ai_project),
             SexualEvaluator(credential, azure_ai_project),
             SelfHarmEvaluator(credential, azure_ai_project),
             HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
+        super().__init__(evaluators=evaluators, **kwargs)
 
     @overload
     def __call__(
@@ -109,36 +105,3 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
-
-    @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
-        """Perform the evaluation using the Azure AI RAI service.
-        The exact evaluation performed is determined by the evaluation metric supplied
-        by the child class initializer.
-
-        :param eval_input: The input to the evaluation function.
-        :type eval_input: Dict
-        :return: The evaluation result.
-        :rtype: Dict
-        """
-        query = eval_input.get("query", None)
-        response = eval_input.get("response", None)
-        conversation = eval_input.get("conversation", None)
-        results: Dict[str, Union[str, float]] = {}
-        # TODO fix this to not explode on empty optional inputs (PF SKD error)
-        if self._parallel:
-            with ThreadPoolExecutor() as executor:
-                # pylint: disable=no-value-for-parameter
-                futures = {
-                    executor.submit(evaluator, query=query, response=response, conversation=conversation): evaluator
-                    for evaluator in self._evaluators
-                }
-
-                for future in as_completed(futures):
-                    results.update(future.result())
-        else:
-            for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response, conversation=conversation)
-                results.update(result)
-
-        return results
```
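Behaviourally the evaluator still fans its inputs out to the four per-category evaluators; only the fan-out now lives in `MultiEvaluatorBase`. A hedged usage sketch with placeholder project values (the query/response call pattern follows the retained overloads and docstring above):

```python
from azure.ai.evaluation import ContentSafetyEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<ai-project-name>",
}

content_safety = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)

# Single query/response pair; a conversation input is also supported, with per-turn
# results aggregated by the base class.
result = content_safety(
    query="What is the capital of France?",
    response="Paris.",
)
print(result)  # merged dict of violence / sexual / self-harm / hate-unfairness results
```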
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

```diff
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import _AggregationType
 
 
 @experimental
@@ -71,6 +72,7 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=_AggregationType.MAX,
         )
 
     @overload
```
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

```diff
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import _AggregationType
 
 
 @experimental
@@ -65,6 +66,7 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             eval_metric=EvaluationMetrics.SELF_HARM,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=_AggregationType.MAX,
         )
 
     @overload
```
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

```diff
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import _AggregationType
 
 
 @experimental
@@ -67,6 +68,7 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             eval_metric=EvaluationMetrics.SEXUAL,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=_AggregationType.MAX,
         )
 
     @overload
```
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

```diff
@@ -9,6 +9,7 @@ from azure.ai.evaluation._common._experimental import experimental
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
+from azure.ai.evaluation._constants import _AggregationType
 
 
 @experimental
@@ -67,6 +68,7 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
             eval_metric=EvaluationMetrics.VIOLENCE,
             azure_ai_project=azure_ai_project,
             credential=credential,
+            conversation_aggregation_type=_AggregationType.MAX,
        )
 
    @overload
```
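All four per-category safety evaluators make the same one-line change: conversation-level results are now aggregated with MAX rather than the default mean, so a single harmful turn is no longer diluted by surrounding benign turns. Illustrative numbers only, using the helper module introduced above:

```python
from azure.ai.evaluation._constants import _AggregationType
from azure.ai.evaluation._evaluators._common._conversation_aggregators import GetAggregator

per_turn_severity = [0.0, 0.0, 6.0, 0.0]  # hypothetical per-turn defect scores

print(GetAggregator(_AggregationType.MEAN)(per_turn_severity))  # 1.5 - harmful turn diluted
print(GetAggregator(_AggregationType.MAX)(per_turn_severity))   # 6.0 - worst turn drives the result
```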
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

```diff
@@ -3,45 +3,44 @@
 # ---------------------------------------------------------
 
 from collections import Counter
-from typing import List
+from typing import List, Dict
+from typing_extensions import overload, override
 
-from promptflow._utils.async_utils import async_run_allowing_running_loop
+from azure.ai.evaluation._evaluators._common import EvaluatorBase
 
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
+class F1ScoreEvaluator(EvaluatorBase):
+    """
+    Calculates the F1 score for a given response and ground truth or a multi-turn conversation.
 
-
-    def __init__(self):
-        pass
+    F1 Scores range from 0 to 1, with 1 being the best possible score.
 
-
-
-
+    The F1-score computes the ratio of the number of shared words between the model generation and
+    the ground truth. Ratio is computed over the individual words in the generated response against those in the ground
+    truth answer. The number of shared words between the generation and the truth is the basis of the F1 score:
+    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
+    is the ratio of the number of shared words to the total number of words in the ground truth.
 
-
-
-
-        :paramtype ground_truth: str
-        :return: The F1 score.
-        :rtype: Dict[str, float]
-        """
-        # Validate inputs
-        if not (response and response.strip() and response != "None") or not (
-            ground_truth and ground_truth.strip() and ground_truth != "None"
-        ):
-            msg = "Both 'response' and 'ground_truth' must be non-empty strings."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                error_category=ErrorCategory.MISSING_FIELD,
-                error_blame=ErrorBlame.USER_ERROR,
-                error_target=ErrorTarget.F1_EVALUATOR,
-            )
+    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
+    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
+    information in the response.
 
-        # Run f1 score computation.
-        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
 
-
+    .. admonition:: Example:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START f1_score_evaluator]
+            :end-before: [END f1_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call an F1ScoreEvaluator.
+    """
+
+    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    def __init__(self):
+        super().__init__()
 
     @classmethod
     def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -103,41 +102,24 @@ class _AsyncF1ScoreEvaluator:
 
         return f1
 
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
+        """Produce an f1 score evaluation result.
 
-
-
-
-
-
-
-
-
-
-    precision is the ratio of the number of shared words to the total number of words in the generation, and recall
-    is the ratio of the number of shared words to the total number of words in the ground truth.
-
-    Use the F1 score when you want a single comprehensive metric that combines both recall and precision in your
-    model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
-    information in the response.
-
-
-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START f1_score_evaluator]
-            :end-before: [END f1_score_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call an F1ScoreEvaluator.
-    """
-
-    id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
-    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+        :param eval_input: The input to the evaluation function.
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        ground_truth = eval_input["ground_truth"]
+        response = eval_input["response"]
+        # Run f1 score computation.
+        f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
 
-
-        self._async_evaluator = _AsyncF1ScoreEvaluator()
+        return {"f1_score": f1_result}
 
-
+    @overload  # type: ignore
+    def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
         """
         Evaluate F1 score.
 
@@ -149,9 +131,20 @@ class F1ScoreEvaluator:
         :rtype: Dict[str, float]
         """
 
-
-
-
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Evaluate F1 score.
 
-
-
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The F1 score.
+        :rtype: Dict[str, float]
+        """
+        return super().__call__(*args, **kwargs)
```