azure-ai-evaluation 1.0.0b3__py3-none-any.whl → 1.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation has been flagged as potentially problematic.
- azure/ai/evaluation/__init__.py +1 -1
- azure/ai/evaluation/_common/constants.py +4 -2
- azure/ai/evaluation/_common/math.py +18 -0
- azure/ai/evaluation/_common/rai_service.py +54 -62
- azure/ai/evaluation/_common/utils.py +182 -12
- azure/ai/evaluation/_constants.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +10 -3
- azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +33 -17
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +17 -2
- azure/ai/evaluation/_evaluate/_eval_run.py +26 -10
- azure/ai/evaluation/_evaluate/_evaluate.py +116 -62
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +16 -17
- azure/ai/evaluation/_evaluate/_utils.py +44 -25
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +59 -30
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +10 -13
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +18 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -20
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +63 -42
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +4 -4
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -4
- azure/ai/evaluation/_evaluators/_eci/_eci.py +4 -4
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +14 -6
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +3 -2
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +4 -4
- azure/ai/evaluation/_evaluators/_qa/_qa.py +4 -3
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +3 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +11 -8
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +21 -7
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -5
- azure/ai/evaluation/_exceptions.py +9 -6
- azure/ai/evaluation/_http_utils.py +203 -132
- azure/ai/evaluation/_model_configurations.py +5 -5
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/_adversarial_simulator.py +85 -60
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -12
- azure/ai/evaluation/simulator/_conversation/_conversation.py +4 -4
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +24 -66
- azure/ai/evaluation/simulator/_helpers/_experimental.py +20 -9
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +4 -4
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +22 -64
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +67 -21
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +28 -11
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +68 -24
- azure/ai/evaluation/simulator/_model_tools/models.py +10 -10
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +0 -5
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +0 -4
- azure/ai/evaluation/simulator/_simulator.py +112 -113
- azure/ai/evaluation/simulator/_tracing.py +4 -4
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/METADATA +72 -44
- azure_ai_evaluation-1.0.0b4.dist-info/NOTICE.txt +50 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/RECORD +64 -56
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.0.0b3.dist-info → azure_ai_evaluation-1.0.0b4.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluate/_telemetry/__init__.py

```diff
@@ -6,7 +6,7 @@ import functools
 import inspect
 import json
 import logging
-from typing import Callable, Dict,
+from typing import Callable, Dict, Literal, Optional, Union, cast
 
 import pandas as pd
 from promptflow._sdk.entities._flows import FlexFlow as flex_flow
@@ -16,31 +16,30 @@ from promptflow.client import PFClient
 from promptflow.core import Prompty as prompty_core
 from typing_extensions import ParamSpec
 
+from azure.ai.evaluation._model_configurations import AzureAIProject
+
 from ..._user_agent import USER_AGENT
-from .._utils import _trace_destination_from_project_scope
+from .._utils import EvaluateResult, _trace_destination_from_project_scope
 
 LOGGER = logging.getLogger(__name__)
 
 P = ParamSpec("P")
-R = TypeVar("R")
 
 
-def _get_evaluator_type(evaluator: Dict[str, Callable]):
+def _get_evaluator_type(evaluator: Dict[str, Callable]) -> Literal["content-safety", "built-in", "custom"]:
    """
    Get evaluator type for telemetry.
 
    :param evaluator: The evaluator object
    :type evaluator: Dict[str, Callable]
    :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety".
-    :rtype:
+    :rtype: Literal["content-safety", "built-in", "custom"]
    """
-    built_in = False
-    content_safety = False
-
    module = inspect.getmodule(evaluator)
-
-
-
+    module_name = module.__name__ if module else ""
+
+    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
+    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")
 
    if content_safety:
        return "content-safety"
@@ -98,22 +97,22 @@ def _get_evaluator_properties(evaluator, evaluator_name):
 
 
 # cspell:ignore isna
-def log_evaluate_activity(func: Callable[P,
+def log_evaluate_activity(func: Callable[P, EvaluateResult]) -> Callable[P, EvaluateResult]:
    """Decorator to log evaluate activity
 
    :param func: The function to be decorated
    :type func: Callable
    :returns: The decorated function
-    :rtype: Callable[P,
+    :rtype: Callable[P, EvaluateResult]
    """
 
    @functools.wraps(func)
-    def wrapper(*args: P.args, **kwargs: P.kwargs) ->
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> EvaluateResult:
        from promptflow._sdk._telemetry import ActivityType, log_activity
        from promptflow._sdk._telemetry.telemetry import get_telemetry_logger
 
-        evaluators = kwargs.get("evaluators",
-        azure_ai_project = kwargs.get("azure_ai_project", None)
+        evaluators = cast(Optional[Dict[str, Callable]], kwargs.get("evaluators", {})) or {}
+        azure_ai_project = cast(Optional[AzureAIProject], kwargs.get("azure_ai_project", None))
 
        pf_client = PFClient(
            config=(
@@ -127,7 +126,7 @@ def log_evaluate_activity(func: Callable[P, R]) -> Callable[P, R]:
        track_in_cloud = bool(pf_client._config.get_trace_destination())  # pylint: disable=protected-access
        evaluate_target = bool(kwargs.get("target", None))
        evaluator_config = bool(kwargs.get("evaluator_config", None))
-        custom_dimensions = {
+        custom_dimensions: Dict[str, Union[str, bool]] = {
            "track_in_cloud": track_in_cloud,
            "evaluate_target": evaluate_target,
            "evaluator_config": evaluator_config,
```
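The telemetry hunks above replace the old boolean-flag bookkeeping in `_get_evaluator_type` with two module-name prefix checks and narrow the return annotation to a `Literal`. The sketch below restates that classification rule as a standalone function so it can be read outside the decorator; `classify_evaluator` is a hypothetical name, the shipped function is `_get_evaluator_type`.

```python
import inspect
from typing import Callable, Literal


def classify_evaluator(evaluator: Callable) -> Literal["content-safety", "built-in", "custom"]:
    # Mirror of the prefix checks introduced in _get_evaluator_type: anything under
    # azure.ai.evaluation._evaluators. counts as built-in, and the _content_safety
    # subpackage is reported separately for telemetry.
    module = inspect.getmodule(evaluator)
    module_name = module.__name__ if module else ""

    built_in = module_name.startswith("azure.ai.evaluation._evaluators.")
    content_safety = built_in and module_name.startswith("azure.ai.evaluation._evaluators._content_safety")

    if content_safety:
        return "content-safety"
    if built_in:
        return "built-in"
    return "custom"
```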
azure/ai/evaluation/_evaluate/_utils.py

```diff
@@ -6,15 +6,22 @@ import logging
 import os
 import re
 import tempfile
-from collections import namedtuple
 from pathlib import Path
-from typing import Dict
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypedDict, Union
 
 import pandas as pd
-
-from
+from promptflow.client import PFClient
+from promptflow.entities import Run
+
+from azure.ai.evaluation._constants import (
+    DEFAULT_EVALUATION_RESULTS_FILE_NAME,
+    DefaultOpenEncoding,
+    EvaluationRunProperties,
+    Prefixes,
+)
 from azure.ai.evaluation._evaluate._eval_run import EvalRun
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._model_configurations import AzureAIProject
 
 LOGGER = logging.getLogger(__name__)
 
@@ -23,14 +30,26 @@ AZURE_WORKSPACE_REGEX_FORMAT = (
    "(/providers/Microsoft.MachineLearningServices)?/workspaces/([^/]+)$"
 )
 
-
+
+class AzureMLWorkspace(NamedTuple):
+    subscription_id: str
+    resource_group_name: str
+    workspace_name: str
 
 
-
+class EvaluateResult(TypedDict):
+    metrics: Dict[str, float]
+    studio_url: Optional[str]
+    rows: List[Dict]
+
+
+def is_none(value) -> bool:
    return value is None or str(value).lower() == "none"
 
 
-def extract_workspace_triad_from_trace_provider(
+def extract_workspace_triad_from_trace_provider(  # pylint: disable=name-too-long
+    trace_provider: str,
+) -> AzureMLWorkspace:
    match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider)
    if not match or len(match.groups()) != 5:
        raise EvaluationException(
@@ -47,7 +66,7 @@ def extract_workspace_triad_from_trace_provider(trace_provider: str):  # pylint:
    subscription_id = match.group(1)
    resource_group_name = match.group(3)
    workspace_name = match.group(5)
-    return
+    return AzureMLWorkspace(subscription_id, resource_group_name, workspace_name)
 
 
 def load_jsonl(path):
@@ -55,7 +74,7 @@ def load_jsonl(path):
        return [json.loads(line) for line in f.readlines()]
 
 
-def _azure_pf_client_and_triad(trace_destination):
+def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWorkspace]:
    from promptflow.azure._cli._utils import _get_azure_pf_client
 
    ws_triad = extract_workspace_triad_from_trace_provider(trace_destination)
@@ -69,14 +88,14 @@ def _azure_pf_client_and_triad(trace_destination):
 
 
 def _log_metrics_and_instance_results(
-    metrics,
-    instance_results,
-    trace_destination,
-    run,
-    evaluation_name,
-) -> str:
+    metrics: Dict[str, Any],
+    instance_results: pd.DataFrame,
+    trace_destination: Optional[str],
+    run: Run,
+    evaluation_name: Optional[str],
+) -> Optional[str]:
    if trace_destination is None:
-        LOGGER.
+        LOGGER.debug("Skip uploading evaluation results to AI Studio since no trace destination was provided.")
        return None
 
    azure_pf_client, ws_triad = _azure_pf_client_and_triad(trace_destination)
@@ -94,7 +113,6 @@ def _log_metrics_and_instance_results(
        ml_client=azure_pf_client.ml_client,
        promptflow_run=run,
    ) as ev_run:
-
        artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN
 
        with tempfile.TemporaryDirectory() as tmpdir:
@@ -112,7 +130,8 @@ def _log_metrics_and_instance_results(
        if run is None:
            ev_run.write_properties_to_run_history(
                properties={
-
+                    EvaluationRunProperties.RUN_TYPE: "eval_run",
+                    EvaluationRunProperties.EVALUATION_RUN: "azure-ai-generative-parent",
                    "_azureml.evaluate_artifacts": json.dumps([{"path": artifact_name, "type": "table"}]),
                    "isEvaluatorRun": "true",
                }
@@ -138,7 +157,7 @@ def _get_ai_studio_url(trace_destination: str, evaluation_id: str) -> str:
    return studio_url
 
 
-def _trace_destination_from_project_scope(project_scope:
+def _trace_destination_from_project_scope(project_scope: AzureAIProject) -> str:
    subscription_id = project_scope["subscription_id"]
    resource_group_name = project_scope["resource_group_name"]
    workspace_name = project_scope["project_name"]
@@ -151,9 +170,9 @@ def _trace_destination_from_project_scope(project_scope: dict) -> str:
    return trace_destination
 
 
-def _write_output(path, data_dict):
+def _write_output(path: Union[str, os.PathLike], data_dict: Any) -> None:
    p = Path(path)
-    if
+    if p.is_dir():
        p = p / DEFAULT_EVALUATION_RESULTS_FILE_NAME
 
    with open(p, "w", encoding=DefaultOpenEncoding.WRITE) as f:
@@ -161,7 +180,7 @@ def _write_output(path, data_dict):
 
 
 def _apply_column_mapping(
-    source_df: pd.DataFrame, mapping_config: Dict[str, str], inplace: bool = False
+    source_df: pd.DataFrame, mapping_config: Optional[Dict[str, str]], inplace: bool = False
 ) -> pd.DataFrame:
    """
    Apply column mapping to source_df based on mapping_config.
@@ -211,7 +230,7 @@ def _apply_column_mapping(
    return result_df
 
 
-def _has_aggregator(evaluator):
+def _has_aggregator(evaluator: object) -> bool:
    return hasattr(evaluator, "__aggregate__")
 
 
@@ -234,11 +253,11 @@ def get_int_env_var(env_var_name: str, default_value: int) -> int:
    return default_value
 
 
-def set_event_loop_policy():
+def set_event_loop_policy() -> None:
    import asyncio
    import platform
 
    if platform.system().lower() == "windows":
        # Reference: https://stackoverflow.com/questions/45600579/asyncio-event-loop-is-closed-when-getting-loop
        # On Windows seems to be a problem with EventLoopPolicy, use this snippet to work around it
-        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())  # type: ignore[attr-defined]
```
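`extract_workspace_triad_from_trace_provider` now returns the new `AzureMLWorkspace` NamedTuple instead of an untyped value, so callers such as `_azure_pf_client_and_triad` can address the workspace triad by field name. A minimal sketch of consuming that shape follows; the `azureml://` URI is illustrative, and only its `/providers/Microsoft.MachineLearningServices/workspaces/` tail is taken from the regex shown in the hunk above.

```python
from typing import NamedTuple


class AzureMLWorkspace(NamedTuple):
    """Same field layout as the NamedTuple added to _utils.py."""

    subscription_id: str
    resource_group_name: str
    workspace_name: str


# Fields can now be addressed by name rather than by tuple position.
ws = AzureMLWorkspace("00000000-0000-0000-0000-000000000000", "my-rg", "my-workspace")
trace_destination = (
    f"azureml://subscriptions/{ws.subscription_id}"
    f"/resourceGroups/{ws.resource_group_name}"
    f"/providers/Microsoft.MachineLearningServices/workspaces/{ws.workspace_name}"
)
print(trace_destination)
```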
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

```diff
@@ -3,6 +3,7 @@
 # ---------------------------------------------------------
 import os
 from typing import Optional
+
 from typing_extensions import override
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
@@ -50,7 +51,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
        query: Optional[str] = None,
        response: Optional[str] = None,
        conversation: Optional[dict] = None,
-        **kwargs
+        **kwargs,
    ):
        """Evaluate coherence. Accepts either a query and response for a single evaluation,
        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
@@ -65,6 +66,6 @@ class CoherenceEvaluator(PromptyEvaluatorBase):
            to be dictionaries with keys "content" and "role".
        :paramtype conversation: Optional[Dict]
        :return: The relevance score.
-        :rtype:
+        :rtype: Dict[str, float]
        """
        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
```
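The only caller-visible change here is documentation: `CoherenceEvaluator.__call__` is now annotated as returning `Dict[str, float]`. A hedged usage sketch follows; the constructor argument and the Azure OpenAI `model_config` keys are assumptions based on the package's documented configuration, not part of this diff.

```python
from azure.ai.evaluation import CoherenceEvaluator

# Assumed Azure OpenAI model configuration; the exact keys are not shown in this diff.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    "api_key": "<api-key>",
}

coherence = CoherenceEvaluator(model_config)
result = coherence(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
)
print(result)  # per the updated docstring, a Dict[str, float]
```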
azure/ai/evaluation/_evaluators/_common/_base_eval.py

```diff
@@ -2,19 +2,55 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from typing import List, Dict, Callable, Any
 import inspect
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Generic, List, TypedDict, TypeVar, Union, cast, final
 
-from abc import ABC
-
-import numpy as np
 from promptflow._utils.async_utils import async_run_allowing_running_loop
+from typing_extensions import ParamSpec, TypeAlias
+
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+P = ParamSpec("P")
+T = TypeVar("T")
+T_EvalValue = TypeVar("T_EvalValue")
+
+
+class DerivedEvalInput(TypedDict, total=False):
+    """The eval input generated by EvaluatorBase._derive_conversation_starter."""
+
+    query: Dict[str, Any]
+    response: Dict[str, Any]
+    context: str
+
+
+AggregateResult: TypeAlias = Dict[str, Union[float, Dict[str, List[T]]]]
+"""TypeAlias that models the return value of EvaluatorBase._aggregate_results
 
-
+.. code-block:: python
+
+    foo: AggregateResult[float] = {
+        "evaluation_per_turn": {
+            "gpt_coherence": [1.0, 2.0, 3.0]
+        },
+        "gpt_coherence": 2.0
+    }
+"""
+
+DoEvalResult: TypeAlias = Dict[str, T]
+"""TypeAlias that models the return value of EvaluatorBase._do_eval
+
+.. code-block:: python
+
+    foo: DoEvalResult[float] = {
+        "gpt_coherence": 2.0
+    }
+"""
 
 
 # TODO exception target pass down?
-class EvaluatorBase(ABC):
+class EvaluatorBase(ABC, Generic[T_EvalValue]):
    """Base class for all evaluators that are capable of accepting either a group of single values,
    or conversation as input. All such evaluators need to implement two functions of their own:
    - _convert_conversation_to_eval_input
@@ -51,7 +87,7 @@ class EvaluatorBase(ABC):
    # This needs to be overridden just to change the function header into something more informative,
    # and to be able to add a more specific docstring. The actual function contents should just be
    # super().__call__(<inputs>)
-    def __call__(self, **kwargs) ->
+    def __call__(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
        """Evaluate a given input. This method serves as a wrapper and is meant to be overridden by child classes for
        one main reason - to overwrite the method headers and docstring to include additional inputs as needed.
        The actual behavior of this function shouldn't change beyond adding more inputs to the
@@ -64,9 +100,8 @@ class EvaluatorBase(ABC):
        """
        return async_run_allowing_running_loop(self._async_evaluator, **kwargs)
 
-
-
-    async def _do_eval(self, eval_input: Any) -> Dict:
+    @abstractmethod
+    async def _do_eval(self, eval_input: Any) -> DoEvalResult[T_EvalValue]:
        """Evaluate the input and produce a response. Must be overridden to produce a functional evaluator.
        In the default case, all required inputs are assumed to be within eval_input, as user-friendly
        typing is handled above this function in favor of polymorphic simplicity. This function must be
@@ -76,12 +111,7 @@ class EvaluatorBase(ABC):
        :type eval_input: Any
        :return: A single evaluation result
        :rtype: Dict
-
        """
-        raise EvaluationException(
-            message="Not implemented",
-            internal_message="BaseConversationEval's _do_eval method called somehow. This should be overridden.",
-        )
 
    # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
@@ -103,7 +133,7 @@ class EvaluatorBase(ABC):
            singletons.append(param)
        return singletons
 
-    def _derive_conversation_converter(self) -> Callable:
+    def _derive_conversation_converter(self) -> Callable[[Dict], List[DerivedEvalInput]]:
        """Produce the function that will be used to convert conversations to a list of evaluable inputs.
        This uses the inputs derived from the _derive_singleton_inputs function to determine which
        aspects of a conversation ought to be extracted.
@@ -115,12 +145,12 @@ class EvaluatorBase(ABC):
        include_query = "query" in self._singleton_inputs
        include_response = "response" in self._singleton_inputs
 
-        def converter(conversation: Dict) -> List:
-            messages = conversation["messages"]
+        def converter(conversation: Dict) -> List[DerivedEvalInput]:
+            messages = cast(List[Dict[str, Any]], conversation["messages"])
            global_context = conversation.get("context", None)
            # Extract queries, responses from conversation
-            queries = []
-            responses = []
+            queries: List[Dict[str, Any]] = []
+            responses: List[Dict[str, Any]] = []
 
            # Convert conversation slice into queries and responses.
            # Assume that 'user' role is asking queries and 'assistant' role is responding.
@@ -147,7 +177,7 @@ class EvaluatorBase(ABC):
                if response_context and not include_response:
                    context["response_context"] = response_context
 
-                eval_input = {}
+                eval_input: DerivedEvalInput = {}
                if include_query:
                    eval_input["query"] = query
                if include_response:
@@ -159,7 +189,7 @@ class EvaluatorBase(ABC):
 
        return converter
 
-    def _convert_kwargs_to_eval_input(self, **kwargs) -> List:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
        """Convert an arbitrary input into a list of inputs for evaluators.
        It is assumed that evaluators generally make use of their inputs in one of two ways.
        Either they receive a collection of keyname inputs that are all single values
@@ -211,7 +241,7 @@ class EvaluatorBase(ABC):
                target=ErrorTarget.CONVERSATION,
            )
 
-    def _aggregate_results(self, per_turn_results: List[
+    def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]]) -> AggregateResult[T_EvalValue]:
        """Aggregate the evaluation results of each conversation turn into a single result.
 
        Exact implementation might need to vary slightly depending on the results produced.
@@ -227,8 +257,8 @@ class EvaluatorBase(ABC):
        :rtype: Dict
        """
 
-        aggregated = {}
-        evaluation_per_turn = {}
+        aggregated: Dict[str, Union[float, Dict[str, List[T_EvalValue]]]] = {}
+        evaluation_per_turn: Dict[str, List[T_EvalValue]] = {}
 
        # Go over each turn, and rotate the results into a
        # metric: List[values] format for the evals_per_turn dictionary.
@@ -241,13 +271,13 @@ class EvaluatorBase(ABC):
        # Find and average all numeric values
        for metric, values in evaluation_per_turn.items():
            if all(isinstance(value, (int, float)) for value in values):
-                aggregated[metric] =
+                aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
        # Slap the per-turn results back in.
        aggregated["evaluation_per_turn"] = evaluation_per_turn
 
        return aggregated
 
-    async def _real_call(self, **kwargs):
+    async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
        """The asynchronous call where real end-to-end evaluation logic is performed.
 
        :keyword kwargs: The inputs to evaluate.
@@ -270,9 +300,8 @@ class EvaluatorBase(ABC):
        # Otherwise, aggregate results.
        return self._aggregate_results(per_turn_results=per_turn_results)
 
-
-
-    def _to_async(self):
+    @final
+    def _to_async(self) -> "AsyncEvaluatorBase":
        return self._async_evaluator
 
 
```
|
|
|
2
2
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
3
3
|
# ---------------------------------------------------------
|
|
4
4
|
|
|
5
|
+
import math
|
|
5
6
|
import re
|
|
6
7
|
from typing import Dict
|
|
7
8
|
|
|
8
|
-
from typing_extensions import override
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
import numpy as np
|
|
12
|
-
|
|
13
9
|
from promptflow.core import AsyncPrompty
|
|
10
|
+
from typing_extensions import override
|
|
14
11
|
|
|
15
|
-
from ..._common.utils import construct_prompty_model_config
|
|
12
|
+
from ..._common.utils import construct_prompty_model_config, validate_model_config
|
|
13
|
+
from . import EvaluatorBase
|
|
16
14
|
|
|
17
15
|
try:
|
|
18
16
|
from ..._user_agent import USER_AGENT
|
|
19
17
|
except ImportError:
|
|
20
|
-
USER_AGENT = None
|
|
21
|
-
from . import EvaluatorBase
|
|
18
|
+
USER_AGENT = "None"
|
|
22
19
|
|
|
23
20
|
|
|
24
|
-
class PromptyEvaluatorBase(EvaluatorBase):
|
|
21
|
+
class PromptyEvaluatorBase(EvaluatorBase[float]):
|
|
25
22
|
"""Base class for all evaluators that make use of context as an input. It's also assumed that such evaluators
|
|
26
23
|
make use of a prompty file, and return their results as a dictionary, with a single key-value pair
|
|
27
24
|
linking the result name to a float value (unless multi-turn evaluation occurs, in which case the
|
|
@@ -42,13 +39,13 @@ class PromptyEvaluatorBase(EvaluatorBase):
|
|
|
42
39
|
LLM_CALL_TIMEOUT = 600
|
|
43
40
|
DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
|
|
44
41
|
|
|
45
|
-
def __init__(self, *, result_key: str, prompty_file: str, model_config:
|
|
42
|
+
def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
|
|
46
43
|
self._result_key = result_key
|
|
47
44
|
self._prompty_file = prompty_file
|
|
48
45
|
super().__init__(eval_last_turn=eval_last_turn)
|
|
49
46
|
|
|
50
47
|
prompty_model_config = construct_prompty_model_config(
|
|
51
|
-
model_config,
|
|
48
|
+
validate_model_config(model_config),
|
|
52
49
|
self.DEFAULT_OPEN_API_VERSION,
|
|
53
50
|
USER_AGENT,
|
|
54
51
|
)
|
|
@@ -59,7 +56,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
|
|
|
59
56
|
# defining a default here.
|
|
60
57
|
|
|
61
58
|
@override
|
|
62
|
-
async def _do_eval(self, eval_input: Dict) -> Dict:
|
|
59
|
+
async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
|
|
63
60
|
"""Do a relevance evaluation.
|
|
64
61
|
|
|
65
62
|
:param eval_input: The input to the evaluator. Expected to contain
|
|
@@ -71,7 +68,7 @@ class PromptyEvaluatorBase(EvaluatorBase):
|
|
|
71
68
|
"""
|
|
72
69
|
llm_output = await self._flow(timeout=self.LLM_CALL_TIMEOUT, **eval_input)
|
|
73
70
|
|
|
74
|
-
score =
|
|
71
|
+
score = math.nan
|
|
75
72
|
if llm_output:
|
|
76
73
|
match = re.search(r"\d", llm_output)
|
|
77
74
|
if match:
|
|
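Two small behavioral notes in this file: the `USER_AGENT` fallback is now the string `"None"` rather than `None`, and the default score switches from `np.nan` to `math.nan`, dropping the numpy import. The digit-parsing fallback works like the standalone sketch below; the final `float(match.group())` conversion is an assumption about the lines beyond this hunk.

```python
import math
import re


def parse_first_digit(llm_output: str) -> float:
    """Mirror of the fallback pattern in PromptyEvaluatorBase._do_eval: return NaN
    when no digit can be extracted from the model output."""
    score = math.nan
    if llm_output:
        match = re.search(r"\d", llm_output)
        if match:
            score = float(match.group())
    return score


print(parse_first_digit("Score: 4 - the response is coherent"))  # 4.0
print(parse_first_digit(""))  # nan
```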
azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py

```diff
@@ -1,48 +1,46 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+from typing import Dict, Optional, Union
 
-from typing import Dict, Optional
 from typing_extensions import override
 
-from azure.
-from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics
 from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.core.credentials import TokenCredential
+
 from . import EvaluatorBase
 
 
-class RaiServiceEvaluatorBase(EvaluatorBase):
+class RaiServiceEvaluatorBase(EvaluatorBase[Union[str, float]]):
    """Base class for all evaluators that require the use of the Azure AI RAI service for evaluation.
    This includes content safety evaluators, protected material evaluators, and others. These evaluators
    are all assumed to be of the "query and response or conversation" input variety.
 
-    param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
-
-    type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
-    param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
+    :param eval_metric: The evaluation metric to be used for evaluation. This is used by the API call logic
+        to specify which evaluation to perform.
+    :type eval_metric: ~azure.ai.evaluation._common.constants.EvaluationMetrics
+    :param eval_last_turn: If True, only the last turn of the conversation will be evaluated, and no
        aggregation will be performed. If False, all turns will be evaluated and the numeric results will be,
        aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
        when this occurs. Default is False, resulting full conversation evaluation and aggregation.
-    type eval_last_turn: bool
+    :type eval_last_turn: bool
    """
 
    @override
    def __init__(
        self,
-        eval_metric: EvaluationMetrics,
+        eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
        azure_ai_project: dict,
-        credential:
+        credential: TokenCredential,
        eval_last_turn: bool = False,
    ):
        super().__init__(eval_last_turn=eval_last_turn)
        self._eval_metric = eval_metric
-        self._azure_ai_project = azure_ai_project
-
-            # Use DefaultCredential if no credential is provided
-            self._credential = DefaultAzureCredential()
-        else:
-            self._credential = credential
+        self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
+        self._credential = credential
 
    @override
    def __call__(
@@ -51,7 +49,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
        query: Optional[str] = None,
        response: Optional[str] = None,
        conversation: Optional[dict] = None,
-        **kwargs
+        **kwargs,
    ):
        """Evaluate either a query and response or a conversation. Must supply either a query AND response,
        or a conversation, but not both.
@@ -65,12 +63,12 @@ class RaiServiceEvaluatorBase(EvaluatorBase):
            to be dictionaries with keys "content", "role", and possibly "context".
        :paramtype conversation: Optional[Dict]
        :return: The evaluation result.
-        :rtype: Dict
+        :rtype: Dict[str, Union[str, float]]
        """
        return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
 
    @override
-    async def _do_eval(self, eval_input: Dict):
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
        """Perform the evaluation using the Azure AI RAI service.
        The exact evaluation performed is determined by the evaluation metric supplied
        by the child class initializer.
```