azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (64)
  1. azure/ai/evaluation/__init__.py +1 -1
  2. azure/ai/evaluation/_common/constants.py +4 -2
  3. azure/ai/evaluation/_common/math.py +18 -0
  4. azure/ai/evaluation/_common/rai_service.py +54 -62
  5. azure/ai/evaluation/_common/utils.py +182 -12
  6. azure/ai/evaluation/_constants.py +10 -2
  7. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
  8. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
  9. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
  10. azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
  11. azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
  12. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
  13. azure/ai/evaluation/_evaluate/_utils.py +44 -25
  14. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  15. azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
  16. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
  17. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
  18. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
  19. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
  20. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
  21. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
  22. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
  23. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
  24. azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
  25. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
  26. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  27. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
  28. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
  29. azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
  30. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
  31. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
  32. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  33. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
  34. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
  35. azure/ai/evaluation/_exceptions.py +9 -6
  36. azure/ai/evaluation/_http_utils.py +203 -132
  37. azure/ai/evaluation/_model_configurations.py +5 -5
  38. azure/ai/evaluation/_vendor/__init__.py +3 -0
  39. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  40. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  41. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  42. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  43. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  44. azure/ai/evaluation/_version.py +1 -1
  45. azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
  46. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
  47. azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
  48. azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
  49. azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
  50. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
  51. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
  52. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
  53. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
  54. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
  55. azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
  56. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
  57. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
  58. azure/ai/evaluation/simulator/_simulator.py +112 -113
  59. azure/ai/evaluation/simulator/_tracing.py +4 -4
  60. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
  61. azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
  62. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
  63. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
  64. {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -2,32 +2,27 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
+from typing import Callable, Dict, List, Union
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
-try:
-    from ._hate_unfairness import HateUnfairnessEvaluator
-    from ._self_harm import SelfHarmEvaluator
-    from ._sexual import SexualEvaluator
-    from ._violence import ViolenceEvaluator
-except ImportError:
-    from _hate_unfairness import HateUnfairnessEvaluator
-    from _self_harm import SelfHarmEvaluator
-    from _sexual import SexualEvaluator
-    from _violence import ViolenceEvaluator
+from ._hate_unfairness import HateUnfairnessEvaluator
+from ._self_harm import SelfHarmEvaluator
+from ._sexual import SexualEvaluator
+from ._violence import ViolenceEvaluator
 
 
 class ContentSafetyEvaluator:
     """
     Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
     :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
         Default is True.
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
     :rtype: Callable
 
@@ -66,13 +61,13 @@ class ContentSafetyEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, parallel: bool = True, credential=None):
+    def __init__(self, credential, azure_ai_project: dict, parallel: bool = True):
         self._parallel = parallel
-        self._evaluators = [
-            ViolenceEvaluator(azure_ai_project, credential),
-            SexualEvaluator(azure_ai_project, credential),
-            SelfHarmEvaluator(azure_ai_project, credential),
-            HateUnfairnessEvaluator(azure_ai_project, credential),
+        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
+            ViolenceEvaluator(credential, azure_ai_project),
+            SexualEvaluator(credential, azure_ai_project),
+            SelfHarmEvaluator(credential, azure_ai_project),
+            HateUnfairnessEvaluator(credential, azure_ai_project),
         ]
 
     def __call__(self, *, query: str, response: str, **kwargs):
@@ -86,9 +81,9 @@ class ContentSafetyEvaluator:
         :keyword parallel: Whether to evaluate in parallel.
         :paramtype parallel: bool
        :return: The scores for content-safety.
-        :rtype: dict
+        :rtype: Dict[str, Union[str, float]]
         """
-        results = {}
+        results: Dict[str, Union[str, float]] = {}
        if self._parallel:
            with ThreadPoolExecutor() as executor:
                futures = {
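
The breaking change in this file is the constructor order: credential is now the first, required positional argument, and the sub-evaluators are constructed as Evaluator(credential, azure_ai_project). A minimal usage sketch of the new 1.0.0b4 call pattern; the project values and the DefaultAzureCredential choice are placeholders, not taken from this diff:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ContentSafetyEvaluator

    # Hypothetical project scope; substitute your own values.
    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    # 1.0.0b4: credential comes first and is required; parallel keeps its default of True.
    safety_eval = ContentSafetyEvaluator(DefaultAzureCredential(), azure_ai_project)
    result = safety_eval(query="What is the capital of France?", response="Paris.")
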
azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py

@@ -2,32 +2,36 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 import logging
+import math
 from concurrent.futures import as_completed
-from typing import Dict, List
+from typing import Callable, Dict, List, TypedDict, Union, cast
 
-import numpy as np
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
+from azure.ai.evaluation._common.constants import HarmSeverityLevel
+from azure.ai.evaluation._common.math import list_mean_nan_safe
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
 
-try:
-    from ._hate_unfairness import HateUnfairnessEvaluator
-    from ._self_harm import SelfHarmEvaluator
-    from ._sexual import SexualEvaluator
-    from ._violence import ViolenceEvaluator
-except ImportError:
-    from _hate_unfairness import HateUnfairnessEvaluator
-    from _self_harm import SelfHarmEvaluator
-    from _sexual import SexualEvaluator
-    from _violence import ViolenceEvaluator
+from ._hate_unfairness import HateUnfairnessEvaluator
+from ._self_harm import SelfHarmEvaluator
+from ._sexual import SexualEvaluator
+from ._violence import ViolenceEvaluator
 
 logger = logging.getLogger(__name__)
 
 
+class _EvaluationPerTurn(TypedDict):
+    severity: List[str]
+    score: List[float]
+    reason: List[str]
+
+
 class ContentSafetyChatEvaluator:
     """
     Initialize a content safety chat evaluator configured to evaluate content safetry metrics for chat scenario.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
@@ -37,8 +41,6 @@ class ContentSafetyChatEvaluator:
     :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
         Default is True.
     :type parallel: bool
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: ~azure.core.credentials.TokenCredential
     :return: A function that evaluates and generates metrics for "chat" scenario.
     :rtype: Callable
 
@@ -87,24 +89,30 @@ class ContentSafetyChatEvaluator:
     }
     """
 
-    def __init__(self, azure_ai_project: dict, eval_last_turn: bool = False, parallel: bool = True, credential=None):
+    def __init__(
+        self,
+        credential,
+        azure_ai_project: dict,
+        eval_last_turn: bool = False,
+        parallel: bool = True,
+    ):
         self._eval_last_turn = eval_last_turn
         self._parallel = parallel
-        self._evaluators = [
+        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
            ViolenceEvaluator(azure_ai_project, credential),
            SexualEvaluator(azure_ai_project, credential),
            SelfHarmEvaluator(azure_ai_project, credential),
            HateUnfairnessEvaluator(azure_ai_project, credential),
        ]
 
-    def __call__(self, *, conversation, **kwargs):
+    def __call__(self, *, conversation: list, **kwargs):
         """
         Evaluates content-safety metrics for "chat" scenario.
 
         :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys.
         :paramtype conversation: List[Dict]
         :return: The scores for Chat scenario.
-        :rtype: dict
+        :rtype: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]
         """
         self._validate_conversation(conversation)
 
@@ -141,7 +149,7 @@ class ContentSafetyChatEvaluator:
                     }
 
                     for future in as_completed(future_to_evaluator):
-                        result = future.result()
+                        result: Dict[str, Union[str, float]] = future.result()
                         current_turn_result.update(result)
             else:
                 # Sequential execution
@@ -154,7 +162,13 @@ class ContentSafetyChatEvaluator:
         aggregated = self._aggregate_results(per_turn_results)
         return aggregated
 
-    def _evaluate_turn(self, turn_num, queries, responses, evaluator):
+    def _evaluate_turn(
+        self,
+        turn_num: int,
+        queries: List[str],
+        responses: List[str],
+        evaluator: Callable[..., Dict[str, Union[str, float]]],
+    ) -> Dict[str, Union[str, float]]:
         try:
             query = queries[turn_num] if turn_num < len(queries) else ""
             response = responses[turn_num] if turn_num < len(responses) else ""
@@ -171,41 +185,48 @@ class ContentSafetyChatEvaluator:
             )
             return {}
 
-    def _aggregate_results(self, per_turn_results: List[Dict]):
-        scores = {}
-        reasons = {}
-        levels = {}
+    def _aggregate_results(
+        self, per_turn_results: List[Dict[str, Union[str, float]]]
+    ) -> Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]]:
+        scores: Dict[str, List[float]] = {}
+        reasons: Dict[str, List[str]] = {}
+        levels: Dict[str, List[str]] = {}
 
        for turn in per_turn_results:
            for metric, value in turn.items():
                if "_score" in metric:
                    if metric not in scores:
                        scores[metric] = []
-                    scores[metric].append(value)
+                    scores[metric].append(cast(float, value))
                elif "_reason" in metric:
                    if metric not in reasons:
                        reasons[metric] = []
-                    reasons[metric].append(value)
+                    reasons[metric].append(cast(str, value))
                else:
                    if metric not in levels:
                        levels[metric] = []
-                    levels[metric].append(value)
+                    levels[metric].append(cast(str, value))
 
-        aggregated = {}
-        evaluation_per_turn = {}
+        aggregated: Dict[str, Union[float, str, Dict[str, _EvaluationPerTurn]]] = {}
+        evaluation_per_turn: Dict[str, _EvaluationPerTurn] = {}
 
        for metric, values in levels.items():
            score_key = f"{metric}_score"
            reason_key = f"{metric}_reason"
 
-            aggregated_score = np.nanmean(scores[score_key])
-            aggregated[metric] = self._get_harm_severity_level(aggregated_score)
+            aggregated_score = list_mean_nan_safe(scores[score_key])
+            harm_severity_level = self._get_harm_severity_level(aggregated_score)
+            aggregated[metric] = (
+                harm_severity_level.value if isinstance(harm_severity_level, HarmSeverityLevel) else harm_severity_level
+            )
            aggregated[score_key] = aggregated_score
 
            # Prepare per-turn evaluations
-            evaluation_per_turn[metric] = {"severity": values}
-            evaluation_per_turn[metric]["score"] = scores[score_key]
-            evaluation_per_turn[metric]["reason"] = reasons[reason_key]
+            evaluation_per_turn[metric] = {
+                "severity": values,
+                "score": scores[score_key],
+                "reason": reasons[reason_key],
+            }
 
        aggregated["evaluation_per_turn"] = evaluation_per_turn
 
@@ -283,19 +304,19 @@ class ContentSafetyChatEvaluator:
                blame=ErrorBlame.USER_ERROR,
            )
 
-    def _get_harm_severity_level(self, harm_score: float) -> str:
+    def _get_harm_severity_level(self, harm_score: float) -> Union[HarmSeverityLevel, float]:
         HARM_SEVERITY_LEVEL_MAPPING = {
-            "Very low": [0, 1],
-            "Low": [2, 3],
-            "Medium": [4, 5],
-            "High": [6, 7],
+            HarmSeverityLevel.VeryLow: (0, 1),
+            HarmSeverityLevel.Low: (2, 3),
+            HarmSeverityLevel.Medium: (4, 5),
+            HarmSeverityLevel.High: (6, 7),
        }
 
-        if harm_score == np.nan or harm_score is None:
-            return np.nan
+        if math.isnan(harm_score) or harm_score is None:
+            return math.nan
 
        for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items():
            if harm_score_range[0] <= harm_score <= harm_score_range[1]:
                return harm_level
 
-        return np.nan
+        return math.nan
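
The aggregation no longer depends on numpy: per-turn scores are averaged with list_mean_nan_safe and the mean is mapped onto a HarmSeverityLevel band. A standalone sketch of that banding logic, using plain math as the new code does; the helper name severity_from_score is illustrative, not part of the package:

    import math
    from typing import Optional

    # Same bands as HARM_SEVERITY_LEVEL_MAPPING above: 0-1 Very low, 2-3 Low, 4-5 Medium, 6-7 High.
    _BANDS = [("Very low", 0, 1), ("Low", 2, 3), ("Medium", 4, 5), ("High", 6, 7)]

    def severity_from_score(harm_score: float) -> Optional[str]:
        """Map an aggregated 0-7 harm score to a severity label; NaN stays unmapped."""
        if math.isnan(harm_score):
            return None
        for label, low, high in _BANDS:
            if low <= harm_score <= high:
                return label
        return None
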
azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py

@@ -1,8 +1,8 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
@@ -11,11 +11,11 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a hate-unfairness evaluator for hate unfairness score.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
 
     **Usage**
 
@@ -43,8 +43,8 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
        self,
+        credential,
        azure_ai_project: dict,
-        credential: Optional[dict] = None,
        eval_last_turn: bool = False,
    ):
        super().__init__(
azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py

@@ -1,8 +1,8 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
@@ -11,11 +11,11 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a self harm evaluator for self harm score.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
 
     **Usage**
 
@@ -43,8 +43,8 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
        self,
+        credential,
        azure_ai_project: dict,
-        credential: Optional[dict] = None,
        eval_last_turn: bool = False,
    ):
        super().__init__(
azure/ai/evaluation/_evaluators/_content_safety/_sexual.py

@@ -1,8 +1,8 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
@@ -11,11 +11,11 @@ class SexualEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a sexual evaluator for sexual score.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
 
     **Usage**
 
@@ -43,8 +43,8 @@ class SexualEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
        self,
+        credential,
        azure_ai_project: dict,
-        credential: Optional[dict] = None,
        eval_last_turn: bool = False,
    ):
        super().__init__(
azure/ai/evaluation/_evaluators/_content_safety/_violence.py

@@ -1,8 +1,8 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
@@ -11,11 +11,11 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase):
     """
     Initialize a violence evaluator for violence score.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
 
     **Usage**
 
@@ -43,8 +43,8 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
        self,
+        credential,
        azure_ai_project: dict,
-        credential: Optional[dict] = None,
        eval_last_turn: bool = False,
    ):
        super().__init__(
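
The same signature change applies to each of the four single-harm evaluators above: credential moves to the first positional slot and is no longer optional. A minimal sketch of the updated construction, using the query/response keywords shown for ContentSafetyEvaluator earlier; project values are placeholders:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ViolenceEvaluator

    credential = DefaultAzureCredential()
    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    # 1.0.0b3: ViolenceEvaluator(azure_ai_project, credential)
    # 1.0.0b4: credential is the first, required argument.
    violence_eval = ViolenceEvaluator(credential, azure_ai_project)
    result = violence_eval(query="Describe the scene.", response="...")
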
azure/ai/evaluation/_evaluators/_eci/_eci.py

@@ -1,8 +1,8 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
+
 from azure.ai.evaluation._common.constants import _InternalEvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
@@ -17,11 +17,11 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
     "AI-generated content may be incorrect. If you are seeking ECI-related information, please go to Bing Search."
     Outputs True or False with AI-generated reasoning.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :return: Whether or not ECI was found in the response without a disclaimer, with AI-generated reasoning
     :rtype: Dict[str, str]
 
@@ -50,8 +50,8 @@ class ECIEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
        self,
+        credential,
        azure_ai_project: dict,
-        credential: Optional[dict] = None,
        eval_last_turn: bool = False,
    ):
        super().__init__(
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -15,6 +15,16 @@ class _AsyncF1ScoreEvaluator:
         pass
 
     async def __call__(self, *, response: str, ground_truth: str, **kwargs):
+        """
+        Evaluate F1 score.
+
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
+        :keyword ground_truth: The ground truth to be evaluated.
+        :paramtype ground_truth: str
+        :return: The F1 score.
+        :rtype: Dict[str, float]
+        """
         # Validate inputs
         if not (response and response.strip() and response != "None") or not (
             ground_truth and ground_truth.strip() and ground_truth != "None"
@@ -34,7 +44,7 @@ class _AsyncF1ScoreEvaluator:
         return {"f1_score": f1_result}
 
     @classmethod
-    def _compute_f1_score(cls, response: str, ground_truth: str) -> str:
+    def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
         import re
         import string
 
@@ -76,11 +86,9 @@ class _AsyncF1ScoreEvaluator:
 
             return white_space_fix(remove_articles(remove_punctuation(lower(text))))
 
-        prediction_tokens = normalize_text(response)
-        reference_tokens = normalize_text(ground_truth)
         tokenizer = QASplitTokenizer()
-        prediction_tokens = tokenizer(prediction_tokens)
-        reference_tokens = tokenizer(reference_tokens)
+        prediction_tokens = tokenizer(normalize_text(response))
+        reference_tokens = tokenizer(normalize_text(ground_truth))
 
         common_tokens = Counter(prediction_tokens) & Counter(reference_tokens)
         num_common_tokens = sum(common_tokens.values())
@@ -131,7 +139,7 @@ class F1ScoreEvaluator:
         :keyword ground_truth: The ground truth to be evaluated.
         :paramtype ground_truth: str
         :return: The F1 score.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
 
         return async_run_allowing_running_loop(
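
For context on the _compute_f1_score changes: the tokens of the normalized response and ground truth are intersected with collections.Counter, and the standard token-overlap F1 is computed from that intersection. A self-contained sketch of that calculation, not the package's exact code, which also handles the zero-token edge cases shown earlier in the file:

    from collections import Counter

    def token_f1(prediction_tokens: list, reference_tokens: list) -> float:
        """Token-overlap F1: harmonic mean of precision and recall over shared tokens."""
        common = Counter(prediction_tokens) & Counter(reference_tokens)
        num_common = sum(common.values())
        if num_common == 0:
            return 0.0
        precision = num_common / len(prediction_tokens)
        recall = num_common / len(reference_tokens)
        return 2 * precision * recall / (precision + recall)

    # Example: 2 shared tokens out of 3 predicted and 4 reference tokens -> F1 = 4/7 ≈ 0.571
    print(token_f1(["paris", "is", "nice"], ["paris", "is", "very", "pretty"]))
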
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -4,6 +4,7 @@
 
 import os
 from typing import Optional
+
 from typing_extensions import override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -51,7 +52,7 @@ class FluencyEvaluator(PromptyEvaluatorBase):
         query: Optional[str] = None,
         response: Optional[str] = None,
         conversation: Optional[dict] = None,
-        **kwargs
+        **kwargs,
     ):
         """
         Evaluate fluency. Accepts either a query and response for a single evaluation,
@@ -67,6 +68,6 @@ class FluencyEvaluator(PromptyEvaluatorBase):
             to be dictionaries with keys "content" and "role".
         :paramtype conversation: Optional[Dict]
         :return: The fluency score.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 import os
 from typing import Optional
+
 from typing_extensions import override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -51,7 +52,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase):
         response: Optional[str] = None,
         context: Optional[str] = None,
         conversation: Optional[dict] = None,
-        **kwargs
+        **kwargs,
     ):
         """Evaluate groundedless. Accepts either a response and context a single evaluation,
         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
@@ -66,6 +67,6 @@ class GroundednessEvaluator(PromptyEvaluatorBase):
             to be dictionaries with keys "content", "role", and possibly "context".
         :paramtype conversation: Optional[Dict]
         :return: The relevance score.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
         return super().__call__(response=response, context=context, conversation=conversation, **kwargs)
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py

@@ -1,8 +1,8 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Optional
 from typing_extensions import override
+
 from azure.ai.evaluation._common.constants import EvaluationMetrics
 from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
 
@@ -12,11 +12,11 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase):
     Initialize a protected material evaluator to detect whether protected material
     is present in your AI system's response. Outputs True or False with AI-generated reasoning.
 
+    :param credential: The credential for connecting to Azure AI project. Required
+    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param credential: The credential for connecting to Azure AI project.
-    :type credential: Optional[~azure.core.credentials.TokenCredential]
     :return: Whether or not protected material was found in the response, with AI-generated reasoning.
     :rtype: Dict[str, str]
 
@@ -45,8 +45,8 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase):
     @override
     def __init__(
        self,
+        credential,
        azure_ai_project: dict,
-        credential: Optional[dict] = None,
        eval_last_turn: bool = False,
    ):
        super().__init__(
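
ProtectedMaterialEvaluator and ECIEvaluator follow the same credential-first constructor, but return a label plus AI-generated reasoning rather than a numeric score. A minimal sketch; project values are placeholders:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ProtectedMaterialEvaluator

    azure_ai_project = {
        "subscription_id": "<subscription-id>",
        "resource_group_name": "<resource-group>",
        "project_name": "<project-name>",
    }

    pm_eval = ProtectedMaterialEvaluator(DefaultAzureCredential(), azure_ai_project)
    # Returns Dict[str, str]: a True/False label with AI-generated reasoning.
    result = pm_eval(query="Recite the lyrics of a copyrighted song.", response="...")
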
azure/ai/evaluation/_evaluators/_qa/_qa.py

@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 
 from concurrent.futures import as_completed
+from typing import Callable, Dict, List
 
 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
 
@@ -53,7 +54,7 @@ class QAEvaluator:
     def __init__(self, model_config: dict, parallel: bool = True):
         self._parallel = parallel
 
-        self._evaluators = [
+        self._evaluators: List[Callable[..., Dict[str, float]]] = [
            GroundednessEvaluator(model_config),
            RelevanceEvaluator(model_config),
            CoherenceEvaluator(model_config),
@@ -77,9 +78,9 @@ class QAEvaluator:
         :keyword parallel: Whether to evaluate in parallel. Defaults to True.
         :paramtype parallel: bool
         :return: The scores for QA scenario.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
-        results = {}
+        results: Dict[str, float] = {}
        if self._parallel:
            with ThreadPoolExecutor() as executor:
                futures = {
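
QAEvaluator (and the prompty-based evaluators it wraps) still takes model_config as its first argument; only the type annotations on _evaluators and the result dict changed here. A minimal usage sketch, assuming an Azure OpenAI deployment and the query/response/context/ground_truth keywords this evaluator family uses; the endpoint, key, and deployment values are placeholders:

    from azure.ai.evaluation import QAEvaluator

    # Hypothetical Azure OpenAI model configuration; substitute your own values.
    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    qa_eval = QAEvaluator(model_config)
    scores = qa_eval(
        query="What is the boiling point of water at sea level?",
        response="100 degrees Celsius.",
        context="Water boils at 100 °C at standard atmospheric pressure.",
        ground_truth="100 °C",
    )
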
azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -4,6 +4,7 @@
 
 import os
 from typing import Optional
+
 from typing_extensions import override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -55,7 +56,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
         response: Optional[str] = None,
         context: Optional[str] = None,
         conversation: Optional[dict] = None,
-        **kwargs
+        **kwargs,
     ):
         """Evaluate relevance. Accepts either a response and context a single evaluation,
         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
@@ -72,6 +73,6 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             to be dictionaries with keys "content", "role", and possibly "context".
         :paramtype conversation: Optional[Dict]
         :return: The relevance score.
-        :rtype: dict
+        :rtype: Dict[str, float]
         """
         return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
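
As the docstring above notes, the prompty-based evaluators now accept either a single query/response/context set or a multi-turn conversation. A short sketch of the single-turn form for RelevanceEvaluator, with the same placeholder model_config as the QAEvaluator example above:

    from azure.ai.evaluation import RelevanceEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",
        "api_key": "<api-key>",
        "azure_deployment": "<deployment-name>",
    }

    relevance_eval = RelevanceEvaluator(model_config)
    # Returns Dict[str, float]; the exact metric key name is not shown in this diff.
    score = relevance_eval(
        query="Where is the Eiffel Tower?",
        response="The Eiffel Tower is in Paris, France.",
        context="The Eiffel Tower is a landmark in Paris.",
    )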