azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +46 -12
- azure/ai/evaluation/_aoai/python_grader.py +84 -0
- azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_common/rai_service.py +3 -3
- azure/ai/evaluation/_common/utils.py +74 -17
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
- azure/ai/evaluation/_evaluate/_utils.py +5 -2
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
- azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
- azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
- azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
- azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +4 -3
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +655 -2665
- azure/ai/evaluation/red_team/_red_team_result.py +6 -0
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/constants.py +0 -2
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
- azure/ai/evaluation/simulator/_simulator.py +12 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
```diff
@@ -4,14 +4,34 @@

 import inspect
 from abc import ABC, abstractmethod
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    List,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+    final,
+    Optional,
+)

 from azure.ai.evaluation._legacy._adapters.utils import async_run_allowing_running_loop
 from typing_extensions import ParamSpec, TypeAlias, get_overloads

-from azure.ai.evaluation._exceptions import
+from azure.ai.evaluation._exceptions import (
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+    EvaluationException,
+)
 from azure.ai.evaluation._common.utils import remove_optional_singletons
-from azure.ai.evaluation._constants import
+from azure.ai.evaluation._constants import (
+    _AggregationType,
+    EVALUATION_PASS_FAIL_MAPPING,
+)
 from azure.ai.evaluation._model_configurations import Conversation
 from azure.ai.evaluation._common._experimental import experimental

@@ -150,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~

-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.

-        :return: A list of
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """

         overloads = get_overloads(self.__call__)
@@ -166,17 +186,70 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-
-
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-
-            return
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)

-    def _derive_conversation_converter(
+    def _derive_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[DerivedEvalInput]]:
         """Produce the function that will be used to convert conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
```
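To make the new overload-matching behaviour easier to follow, here is a small standalone sketch of the same heuristic used by `_get_matching_overload_inputs` above. The `overload_inputs` data is made up for illustration; only the scoring logic mirrors the diff.

```python
# Standalone sketch of the overload-matching heuristic (hypothetical data).
# Each inner list mimics the singleton inputs derived from one __call__ overload.
overload_inputs = [
    ["query", "response"],
    ["query", "response", "context"],
    ["response", "ground_truth"],
]

def match_overload(provided: dict) -> list:
    """Pick the overload whose parameters best match the non-None kwargs."""
    provided_keys = {k for k, v in provided.items() if v is not None}
    best_match, best_score = None, -1
    # Prefer overloads fully covered by the provided keys, largest first.
    for inputs in overload_inputs:
        if set(inputs).issubset(provided_keys) and len(inputs) > best_score:
            best_match, best_score = inputs, len(inputs)
    if best_match is not None:
        return best_match
    # Otherwise fall back to the overload with the most overlap.
    for inputs in overload_inputs:
        overlap = len(set(inputs) & provided_keys)
        if overlap > best_score:
            best_match, best_score = inputs, overlap
    return best_match or (overload_inputs[0] if overload_inputs else [])

print(match_overload({"query": "q", "response": "r", "context": "c"}))
# ['query', 'response', 'context']  (the largest fully satisfied overload wins)
print(match_overload({"response": "r"}))
# ['query', 'response']  (no full match; the overload with the most overlap wins)
```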
```diff
@@ -184,10 +257,11 @@
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-
-
-
-
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs

         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -235,7 +309,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return converter

-    def _derive_multi_modal_conversation_converter(
+    def _derive_multi_modal_conversation_converter(
+        self,
+    ) -> Callable[[Dict], List[Dict[str, Any]]]:
         """Produce the function that will be used to convert multi-modal conversations to a list of evaluable inputs.
         This uses the inputs derived from the _derive_singleton_inputs function to determine which
         aspects of a conversation ought to be extracted.
@@ -288,16 +364,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return multi_modal_converter

-    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput]]:
+    def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[DerivedEvalInput], Dict[str, Any]]:
         """Convert an arbitrary input into a list of inputs for evaluators.
         It is assumed that evaluators generally make use of their inputs in one of two ways.
         Either they receive a collection of keyname inputs that are all single values
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.

-        The self._singleton_inputs list assigned during initialization
-
-
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.

         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -315,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -330,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-
-
-
-
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -392,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated

+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+        # Attach results to their corresponding calls
+        for tool_call in tool_calls:
+            tool_call_id = tool_call.get("tool_call_id")
+            if tool_call_id in tool_results_map:
+                tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.

```
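The new `_parse_tools_from_response` helper walks an agent-style message list. Below is a minimal illustration of the shape it consumes and what it returns; the message content is invented for the example.

```python
# Hypothetical agent response in the message format the helper above iterates over.
response = [
    {
        "role": "assistant",
        "content": [
            {"type": "tool_call", "tool_call_id": "call_1",
             "name": "fetch_weather", "arguments": {"city": "Seattle"}},
        ],
    },
    {
        "role": "tool",
        "tool_call_id": "call_1",
        "content": [{"type": "tool_result", "tool_result": "72F and sunny"}],
    },
]

# An evaluator's _parse_tools_from_response(response) would return the tool_call items,
# with each call's matching tool_result attached under the "tool_result" key:
# [{"type": "tool_call", "tool_call_id": "call_1", "name": "fetch_weather",
#   "arguments": {"city": "Seattle"}, "tool_result": "72F and sunny"}]
```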
```diff
@@ -5,7 +5,7 @@
 import math
 import re
 import os
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union

 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override

+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         model_config: dict,
         eval_last_turn: bool = False,
         threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
         _higher_is_better: bool = False,
         **kwargs,
     ) -> None:
@@ -82,7 +84,10 @@
         )

         self._flow = AsyncPrompty.load(
-            source=self._prompty_file,
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
         )

     # __call__ not overridden here because child classes have such varied signatures that there's no point
@@ -36,14 +36,17 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
    :type eval_last_turn: bool
-    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation
-        to produce a single result.
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
        Default is ~azure.ai.evaluation._AggregationType.MEAN.
    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
    :param threshold: The threshold for the evaluation. Default is 3.
    :type threshold: Optional[int]
    :param _higher_is_better: If True, higher scores are better. Default is True.
    :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
    """

    @override
@@ -56,6 +59,7 @@
         conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
         threshold: int = 3,
         _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
         super().__init__(
             eval_last_turn=eval_last_turn,
@@ -67,6 +71,9 @@
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
         self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
         self._higher_is_better = _higher_is_better

     @override
@@ -101,7 +108,7 @@
         :return: The evaluation result.
         :rtype: Dict
         """
-        if "
+        if "response" in eval_input:
             return await self._evaluate_query_response(eval_input)

         conversation = eval_input.get("conversation", None)
@@ -131,19 +138,22 @@
         return result

     async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
-        query =
-        response =
-        if
+        query = eval_input.get("query", None)
+        response = eval_input.get("response", None)
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)

-        if "context" in self.
+        if "context" in self._get_all_singleton_inputs():
             context = eval_input.get("context", None)
             if context is None:
                 raise EvaluationException(
```
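The effect of the new `evaluate_query` flag on the payload built by `_evaluate_query_response` can be shown with a reduced sketch. Only the relevant lines of the hunk above are mirrored; the function name is illustrative and omits the exception and context handling.

```python
# Minimal sketch of how the hunk above assembles input_data (illustrative only).
def build_input_data(eval_input: dict, evaluate_query: bool) -> dict:
    query = eval_input.get("query")
    response = eval_input.get("response")
    input_data = {"response": str(response)}
    # The query is only forwarded when the evaluator was built with evaluate_query=True.
    if query is not None and evaluate_query:
        input_data["query"] = str(query)
    return input_data

sample = {"query": "How do I reset my password?", "response": "Use the account portal."}
print(build_input_data(sample, evaluate_query=False))
# {'response': 'Use the account portal.'}
print(build_input_data(sample, evaluate_query=True))
# {'response': 'Use the account portal.', 'query': 'How do I reset my password?'}
```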
```diff
@@ -33,6 +33,8 @@ class ContentSafetyEvaluator(MultiEvaluatorBase[Union[str, float]]):
    :type self_harm_threshold: int
    :param hate_unfairness_threshold: The threshold for hate/unfairness evaluation. Default is 3.
    :type hate_unfairness_threshold: int
+    :param evaluate_query: Whether to also evaluate the query in addition to the response. Default is False.
+    :type evaluate_query: bool
    :param kwargs: Additional arguments to pass to the evaluator.
    :type kwargs: Any
    :return: A function that evaluates content-safety metrics for "question-answering" scenario.
@@ -66,8 +68,9 @@
            :caption: Initialize with threshold and call a ContentSafetyEvaluator with a query and response.
    """

-    id = "content_safety"
+    id = "azureai://built-in/evaluators/content_safety"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    def __init__(
        self,
@@ -90,11 +93,18 @@
             if not isinstance(value, int):
                 raise TypeError(f"{name} must be an int, got {type(value)}")

+        # Extract evaluate_query from kwargs if present
+        evaluate_query_kwargs = {}
+        if "evaluate_query" in kwargs:
+            evaluate_query_kwargs["evaluate_query"] = kwargs["evaluate_query"]
+
         evaluators = [
-            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold),
-            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold),
-            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold),
-            HateUnfairnessEvaluator(
+            ViolenceEvaluator(credential, azure_ai_project, threshold=violence_threshold, **evaluate_query_kwargs),
+            SexualEvaluator(credential, azure_ai_project, threshold=sexual_threshold, **evaluate_query_kwargs),
+            SelfHarmEvaluator(credential, azure_ai_project, threshold=self_harm_threshold, **evaluate_query_kwargs),
+            HateUnfairnessEvaluator(
+                credential, azure_ai_project, threshold=hate_unfairness_threshold, **evaluate_query_kwargs
+            ),
         ]
         super().__init__(evaluators=evaluators, **kwargs)

```
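Taken together, `ContentSafetyEvaluator` now forwards `evaluate_query` to each child RAI evaluator. A hedged usage sketch based on the signatures visible in this diff; the project endpoint is a placeholder and `DefaultAzureCredential` is just one possible credential.

```python
# Usage sketch only: endpoint and question are placeholders, not taken from the package.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ContentSafetyEvaluator

content_safety = ContentSafetyEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",
    evaluate_query=True,  # new in this release: child evaluators also receive the query
)

result = content_safety(
    query="What is the capital of France?",
    response="Paris.",
)
print(result)
```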
```diff
@@ -80,8 +80,9 @@ class HateUnfairnessEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a HateUnfairnessEvaluator with a query and response.
    """

-    id = "
+    id = "azureai://built-in/evaluators/hate_unfairness"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
@@ -90,6 +91,7 @@
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.HATE_FAIRNESS,
@@ -98,6 +100,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
@@ -65,8 +65,9 @@ class SelfHarmEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
        https://{resource_name}.services.ai.azure.com/api/projects/{project_name}.
    """

-    id = "
+    id = "azureai://built-in/evaluators/self_harm"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
@@ -75,6 +76,7 @@
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SELF_HARM,
@@ -83,6 +85,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
@@ -76,8 +76,9 @@ class SexualEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a SexualEvaluator.
    """

-    id = "
+    id = "azureai://built-in/evaluators/sexual"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
@@ -86,6 +87,7 @@
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.SEXUAL,
@@ -94,6 +96,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
@@ -146,7 +149,7 @@
             key "messages". Conversation turns are expected
             to be dictionaries with keys "content" and "role".
         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The
+        :return: The sexual score.
         :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[str, float, Dict[str, List[Union[str, float]]]]]]
         """
         return super().__call__(*args, **kwargs)
@@ -76,8 +76,9 @@ class ViolenceEvaluator(RaiServiceEvaluatorBase[Union[str, float]]):
            :caption: Initialize with threshold and call a ViolenceEvaluator.
    """

-    id = "
+    id = "azureai://built-in/evaluators/violence"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
@@ -86,6 +87,7 @@
         azure_ai_project,
         *,
         threshold: int = 3,
+        **kwargs,
     ):
         super().__init__(
             eval_metric=EvaluationMetrics.VIOLENCE,
@@ -94,6 +96,7 @@
             conversation_aggregation_type=_AggregationType.MAX,
             threshold=threshold,
             _higher_is_better=False,
+            **kwargs,
         )

     @overload
@@ -49,6 +49,9 @@ class DocumentRetrievalEvaluator(EvaluatorBase):
            :caption: Initialize with threshold and call a DocumentRetrievalEvaluator.
    """

+    id = "azureai://built-in/evaluators/document_retrieval"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
    def __init__(
        self,
        *,
@@ -52,17 +52,20 @@ class ECIEvaluator(RaiServiceEvaluatorBase):

    id = "eci"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+    _OPTIONAL_PARAMS = ["query"]

    @override
    def __init__(
        self,
        credential,
        azure_ai_project,
+        **kwargs,
    ):
        super().__init__(
            eval_metric=_InternalEvaluationMetrics.ECI,
            azure_ai_project=azure_ai_project,
            credential=credential,
+            **kwargs,
        )

    @overload
@@ -58,7 +58,7 @@ class F1ScoreEvaluator(EvaluatorBase):
            :caption: Initialize with threshold and call an F1ScoreEvaluator.
    """

-    id = "
+    id = "azureai://built-in/evaluators/f1_score"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    def __init__(self, *, threshold=0.5):
@@ -64,11 +64,11 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    _PROMPTY_FILE = "fluency.prompty"
    _RESULT_KEY = "fluency"

-    id = "
+    id = "azureai://built-in/evaluators/fluency"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
@@ -78,6 +78,7 @@
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )

```
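The prompty-based base class now accepts a `credential` that is forwarded to `AsyncPrompty.load`, and `FluencyEvaluator` exposes it in its constructor (see the hunks above). A brief usage sketch with placeholder model configuration values; the call shape is assumed from the evaluator's existing response-only signature.

```python
# Sketch only: model_config values are placeholders for an Azure OpenAI deployment.
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import FluencyEvaluator

model_config = {
    "azure_endpoint": "https://<aoai-resource>.openai.azure.com",
    "azure_deployment": "<deployment-name>",
    # No api_key here: the credential below is assumed to be used for token-based auth.
}

fluency = FluencyEvaluator(model_config=model_config, credential=DefaultAzureCredential())
print(fluency(response="Paris is the capital of France."))
```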
```diff
@@ -55,7 +55,7 @@ class GleuScoreEvaluator(EvaluatorBase):
        https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
    """

-    id = "
+    id = "azureai://built-in/evaluators/gleu_score"
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
```