azure-ai-evaluation 1.10.0__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff shows the changes between these two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
- azure/ai/evaluation/_converters/_ai_services.py +60 -10
- azure/ai/evaluation/_converters/_models.py +75 -26
- azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
- azure/ai/evaluation/_evaluate/_evaluate.py +13 -4
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +77 -33
- azure/ai/evaluation/_evaluate/_utils.py +4 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +2 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +113 -19
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +1 -1
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +2 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +113 -3
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +8 -2
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +2 -1
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +10 -2
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +2 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +2 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +8 -2
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +104 -60
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +58 -41
- azure/ai/evaluation/_exceptions.py +1 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/__init__.py +2 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
- azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
- azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
- azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
- azure/ai/evaluation/red_team/_red_team.py +697 -3067
- azure/ai/evaluation/red_team/_result_processor.py +610 -0
- azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +3 -1
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
- azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
- azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
- azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
- azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -0
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +19 -5
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +4 -3
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +32 -2
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +49 -41
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.10.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluate/_utils.py
+++ b/azure/ai/evaluation/_evaluate/_utils.py
@@ -138,6 +138,7 @@ def _log_metrics_and_instance_results_onedp(
     project_url: str,
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
 
@@ -191,6 +192,7 @@ def _log_metrics_and_instance_results_onedp(
         evaluation=EvaluationUpload(
             display_name=evaluation_name,
             properties=properties,
+            tags=tags,
         )
     )
 
@@ -215,6 +217,7 @@ def _log_metrics_and_instance_results(
     run: Optional[Run],
     evaluation_name: Optional[str],
     name_map: Dict[str, str],
+    tags: Optional[Dict[str, str]] = None,
     **kwargs,
 ) -> Optional[str]:
     from azure.ai.evaluation._evaluate._eval_run import EvalRun
@@ -244,6 +247,7 @@ def _log_metrics_and_instance_results(
         workspace_name=ws_triad.workspace_name,
         management_client=management_client,
         promptflow_run=run,
+        tags=tags,
     ) as ev_run:
         artifact_name = EvalRun.EVALUATION_ARTIFACT
 
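A hypothetical usage sketch (not part of the diff): the new tags dictionary added above is plumbed through the run-logging helpers, and the public evaluate() entry point (also touched in _evaluate.py in this release) is assumed here to accept and forward it. Endpoint, deployment, and dataset values are placeholders.

    # Assumption: evaluate() accepts and forwards the new `tags` dict shown above.
    from azure.ai.evaluation import evaluate, CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",                       # placeholder
        "api_key": "<key>",                                       # placeholder
    }

    result = evaluate(
        data="eval_data.jsonl",  # placeholder JSONL dataset
        evaluators={"coherence": CoherenceEvaluator(model_config)},
        tags={"team": "search", "experiment": "baseline-v2"},  # logged with the evaluation run
    )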
--- a/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
+++ b/azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -66,7 +66,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -76,6 +76,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
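A minimal construction sketch (assumption, not shown in the diff): with the new credential keyword, prompty-based evaluators such as CoherenceEvaluator can authenticate with an Entra ID token credential instead of an API key. Endpoint and deployment values are placeholders.

    # Assumption: keyless auth via the new `credential` parameter added above.
    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",                       # placeholder
    }

    coherence = CoherenceEvaluator(model_config, threshold=3, credential=DefaultAzureCredential())
    print(coherence(query="What is 2 + 2?", response="2 + 2 equals 4."))

The same credential keyword is added to the other prompty-based evaluators further down in this diff (fluency, groundedness, intent resolution, relevance, response completeness, retrieval, similarity, task adherence).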
--- a/azure/ai/evaluation/_evaluators/_common/_base_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -170,15 +170,15 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
 
     # ~~~ METHODS THAT MIGHT NEED TO BE OVERRIDDEN BY CHILDREN~~~
 
-    def _derive_singleton_inputs(self) -> List[str]:
+    def _derive_singleton_inputs(self) -> List[List[str]]:
         """Inspect the evaluator's __call__ function to determine what singleton inputs are expected
         when the evaluator is being used in a non-conversation context.
         By default, it's assumed that any input that is NOT kwargs or a conversation are singleton inputs.
         Thankfully this works the way you'd hope, with the call_signature being based on the child
         function's signature, not the parent's.
 
-        :return: A list of
-        :rtype: List[str]
+        :return: A list of lists, where each inner list represents the singleton inputs for each overload.
+        :rtype: List[List[str]]
         """
 
         overloads = get_overloads(self.__call__)
@@ -186,15 +186,66 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             call_signatures = [inspect.signature(self.__call__)]
         else:
             call_signatures = [inspect.signature(overload) for overload in overloads]
-
-
+
+        overload_inputs = []
         for call_signature in call_signatures:
             params = call_signature.parameters
             if any(not_singleton_input in params for not_singleton_input in self._not_singleton_inputs):
                 continue
             # exclude self since it is not a singleton input
-
-        return
+            overload_inputs.append([p for p in params if p != "self"])
+        return overload_inputs
+
+    def _get_matching_overload_inputs(self, **kwargs) -> List[str]:
+        """Find the overload that matches the provided kwargs and return its input parameters.
+
+        :keyword kwargs: The keyword arguments to match against overloads.
+        :type kwargs: Dict
+        :return: List of input parameter names for the matching overload.
+        :rtype: List[str]
+        """
+        overload_inputs = self._singleton_inputs
+        provided_keys = set(key for key, value in kwargs.items() if value is not None)
+
+        # Find the overload that best matches the provided parameters
+        best_match = None
+        best_score = -1
+
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+
+            # Calculate match score: how many of the overload's params are provided
+            if input_set.issubset(provided_keys):
+                score = len(input_set)
+                if score > best_score:
+                    best_score = score
+                    best_match = inputs
+
+        # If exact match found, return it
+        if best_match is not None:
+            return best_match
+
+        # If no exact match, find the overload with the most overlap
+        for inputs in overload_inputs:
+            input_set = set(inputs)
+            overlap = len(input_set.intersection(provided_keys))
+            if overlap > best_score:
+                best_score = overlap
+                best_match = inputs
+
+        # Return the best match or the first overload as fallback
+        return best_match if best_match is not None else (overload_inputs[0] if overload_inputs else [])
+
+    def _get_all_singleton_inputs(self) -> List[str]:
+        """Get a flattened list of all possible singleton inputs across all overloads.
+
+        :return: Flattened list of all singleton input names.
+        :rtype: List[str]
+        """
+        all_inputs = set()
+        for inputs in self._singleton_inputs:
+            all_inputs.update(inputs)
+        return list(all_inputs)
 
     def _derive_conversation_converter(
         self,
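To illustrate the overload matching added above, here is a standalone sketch of the same best-subset scoring (the two overload signatures are hypothetical and do not come from the diff):

    # Mirrors _get_matching_overload_inputs: prefer the largest overload whose
    # parameters are all provided; otherwise fall back to the largest overlap.
    from typing import Dict, List

    overload_inputs: List[List[str]] = [
        ["query", "response"],                      # hypothetical text overload
        ["query", "response", "tool_definitions"],  # hypothetical agent overload
    ]

    def match_overload(provided_kwargs: Dict[str, object]) -> List[str]:
        provided = {k for k, v in provided_kwargs.items() if v is not None}
        exact = [inputs for inputs in overload_inputs if set(inputs).issubset(provided)]
        if exact:
            return max(exact, key=len)
        return max(overload_inputs, key=lambda inputs: len(set(inputs) & provided))

    print(match_overload({"query": "q", "response": "r"}))
    # ['query', 'response']
    print(match_overload({"query": "q", "response": [{"role": "assistant"}], "tool_definitions": [{}]}))
    # ['query', 'response', 'tool_definitions']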
@@ -206,10 +257,11 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         :return: The function that will be used to convert conversations to evaluable inputs.
         :rtype: Callable
         """
-
-
-
-
+        all_singleton_inputs = self._get_all_singleton_inputs()
+        include_context = "context" in all_singleton_inputs
+        include_query = "query" in all_singleton_inputs
+        include_response = "response" in all_singleton_inputs
+        include_ground_truth = "ground_truth" in all_singleton_inputs
 
         def converter(conversation: Dict) -> List[DerivedEvalInput]:
             messages = cast(List[Dict[str, Any]], conversation["messages"])
@@ -319,9 +371,9 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         (like a query and response), or they receive conversation that iss a list of dictionary
         values.
 
-        The self._singleton_inputs list assigned during initialization
-
-
+        The self._singleton_inputs list (containing overload signatures) assigned during initialization
+        is used to find and extract singleton keywords, and determine which overload matches the
+        provided arguments.
 
         If both conversations and singletons are allowed, the function will raise an exception if both
         are inputted.
@@ -339,7 +391,10 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
         conversation = kwargs.get("conversation", None)
         singletons = {}
         if len(self._singleton_inputs) > 0:
-
+            # Get all possible singleton inputs and check what's provided
+            all_singleton_inputs = self._get_all_singleton_inputs()
+            singletons = {key: kwargs.get(key, None) for key in all_singleton_inputs}
+
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
             msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
@@ -354,10 +409,16 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if self._is_multi_modal_conversation(conversation):
                 return self._derive_multi_modal_conversation_converter()(conversation)
             return self._derive_conversation_converter()(conversation)
-
-
-
-
+
+        # Handle Singletons - find matching overload
+        matching_inputs = self._get_matching_overload_inputs(**kwargs)
+        if matching_inputs:
+            # Check if all required inputs for this overload are provided
+            required_singletons = {key: kwargs.get(key, None) for key in matching_inputs}
+            required_singletons = remove_optional_singletons(self, required_singletons)
+            if all(value is not None for value in required_singletons.values()):
+                return [singletons]
+
         # Missing input
         msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
@@ -416,6 +477,39 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             aggregated["evaluation_per_turn"] = evaluation_per_turn
         return aggregated
 
+    def _parse_tools_from_response(self, response):
+        """Parse the response to extract tool calls and results.
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tool calls extracted from the response.
+        :rtype: List[dict]
+        """
+        tool_calls = []
+        tool_results_map = {}
+        if isinstance(response, list):
+            for message in response:
+                # Extract tool calls from assistant messages
+                if message.get("role") == "assistant" and isinstance(message.get("content"), list):
+                    for content_item in message.get("content"):
+                        if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
+                            tool_calls.append(content_item)
+
+                # Extract tool results from tool messages
+                elif message.get("role") == "tool" and message.get("tool_call_id"):
+                    tool_call_id = message.get("tool_call_id")
+                    if isinstance(message.get("content"), list) and len(message.get("content")) > 0:
+                        result_content = message.get("content")[0]
+                        if isinstance(result_content, dict) and result_content.get("type") == "tool_result":
+                            tool_results_map[tool_call_id] = result_content
+
+            # Attach results to their corresponding calls
+            for tool_call in tool_calls:
+                tool_call_id = tool_call.get("tool_call_id")
+                if tool_call_id in tool_results_map:
+                    tool_call["tool_result"] = tool_results_map[tool_call_id]["tool_result"]
+
+        return tool_calls
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.
 
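For reference, a hypothetical agent response in the message shape that _parse_tools_from_response walks (the field names follow the code above; the values are invented):

    # A tool call in an assistant message, paired by tool_call_id with a tool
    # message carrying a tool_result; the helper attaches the result to the call.
    agent_response = [
        {
            "role": "assistant",
            "content": [
                {"type": "tool_call", "tool_call_id": "call_1", "name": "file_search",
                 "arguments": {"queries": ["refund policy"]}},
            ],
        },
        {
            "role": "tool",
            "tool_call_id": "call_1",
            "content": [
                {"type": "tool_result",
                 "tool_result": [{"content": [{"type": "text", "text": "Refunds are issued within 30 days."}]}]},
            ],
        },
    ]
    # _parse_tools_from_response(agent_response) would return the tool_call dict
    # with its matching result attached under the "tool_result" key.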
--- a/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -5,7 +5,7 @@
 import math
 import re
 import os
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union
 
 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -13,6 +13,7 @@ else:
     from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override
 
+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
 from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
@@ -63,6 +64,7 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         model_config: dict,
         eval_last_turn: bool = False,
         threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
         _higher_is_better: bool = False,
         **kwargs,
     ) -> None:
@@ -82,7 +84,10 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         )
 
         self._flow = AsyncPrompty.load(
-            source=self._prompty_file,
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
         )
 
         # __call__ not overridden here because child classes have such varied signatures that there's no point
--- a/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -153,7 +153,7 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         if query is not None and self._evaluate_query:
             input_data["query"] = str(query)
 
-        if "context" in self.
+        if "context" in self._get_all_singleton_inputs():
             context = eval_input.get("context", None)
             if context is None:
                 raise EvaluationException(
--- a/azure/ai/evaluation/_evaluators/_fluency/_fluency.py
+++ b/azure/ai/evaluation/_evaluators/_fluency/_fluency.py
@@ -68,7 +68,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -78,6 +78,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
--- a/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
+++ b/azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-import os
+import os, logging
 from typing import Dict, List, Optional, Union
 
 from typing_extensions import overload, override
@@ -9,7 +9,14 @@ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty
 
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
-from ..._common.utils import
+from ..._common.utils import (
+    ErrorBlame,
+    ErrorTarget,
+    EvaluationException,
+    ErrorCategory,
+    construct_prompty_model_config,
+    validate_model_config,
+)
 
 try:
     from ..._user_agent import UserAgentSingleton
@@ -21,6 +28,9 @@ except ImportError:
             return "None"
 
 
+logger = logging.getLogger(__name__)
+
+
 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """
     Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
@@ -78,12 +88,13 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _PROMPTY_FILE_WITH_QUERY = "groundedness_with_query.prompty"
     _RESULT_KEY = "groundedness"
     _OPTIONAL_PARAMS = ["query"]
+    _SUPPORTED_TOOLS = ["file_search"]
 
     id = "azureai://built-in/evaluators/groundedness"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3, **kwargs):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query
 
@@ -93,6 +104,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
         self._model_config = model_config
@@ -120,6 +132,26 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :rtype: Dict[str, float]
         """
 
+    @overload
+    def __call__(
+        self,
+        *,
+        query: str,
+        response: List[dict],
+        tool_definitions: List[dict],
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate groundedness for agent response with tool calls. Only file_search tool is supported.
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response from the agent to be evaluated.
+        :paramtype response: List[dict]
+        :keyword tool_definitions: The tool definitions used by the agent.
+        :paramtype tool_definitions: List[dict]
+        :return: The groundedness score.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
     @overload
     def __call__(
         self,
@@ -174,3 +206,81 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
 
         return super().__call__(*args, **kwargs)
+
+    async def _real_call(self, **kwargs):
+        """The asynchronous call where real end-to-end evaluation logic is performed.
+
+        :keyword kwargs: The inputs to evaluate.
+        :type kwargs: Dict
+        :return: The evaluation result.
+        :rtype: Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]
+        """
+        # Convert inputs into list of evaluable inputs.
+        try:
+            return await super()._real_call(**kwargs)
+        except EvaluationException as ex:
+            if ex.category == ErrorCategory.NOT_APPLICABLE:
+                return {
+                    self._result_key: self._NOT_APPLICABLE_RESULT,
+                    f"{self._result_key}_result": "pass",
+                    f"{self._result_key}_threshold": self.threshold,
+                    f"{self._result_key}_reason": f"Supported tools were not called. Supported tools for groundedness are {self._SUPPORTED_TOOLS}.",
+                }
+            else:
+                raise ex
+
+    def _convert_kwargs_to_eval_input(self, **kwargs):
+        if "context" in kwargs or "conversation" in kwargs:
+            return super()._convert_kwargs_to_eval_input(**kwargs)
+
+        query = kwargs.get("query")
+        response = kwargs.get("response")
+        tool_definitions = kwargs.get("tool_definitions")
+
+        if not query or not response or not tool_definitions:
+            msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query', 'response' and 'tool_definitions' are required."
+            raise EvaluationException(
+                message=msg,
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        context = self._get_context_from_agent_response(response, tool_definitions)
+        if not context:
+            raise EvaluationException(
+                message=f"Context could not be extracted from agent response. Supported tools for groundedness are {self._SUPPORTED_TOOLS}. If supported tools are not used groundedness is not calculated.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.NOT_APPLICABLE,
+                target=ErrorTarget.GROUNDEDNESS_EVALUATOR,
+            )
+
+        return super()._convert_kwargs_to_eval_input(response=response[-1], context=context, query=query)
+
+    def _get_context_from_agent_response(self, response, tool_definitions):
+        context = ""
+        try:
+            logger.debug("Extracting context from response")
+            tool_calls = self._parse_tools_from_response(response=response)
+            logger.debug(f"Tool Calls parsed successfully : {tool_calls}")
+            if tool_calls:
+                for tool_call in tool_calls:
+                    if isinstance(tool_call, dict) and tool_call.get("type") == "tool_call":
+                        tool_name = tool_call.get("name")
+                        for tool in tool_definitions:
+                            if tool.get("name") == tool_name and tool.get("type") in self._SUPPORTED_TOOLS:
+                                if tool_name == "file_search":
+                                    tool_result = tool_call.get("tool_result")
+                                    if tool_result:
+                                        for result in tool_result:
+                                            content_list = result.get("content")
+                                            if content_list:
+                                                for content in content_list:
+                                                    text = content.get("text")
+                                                    if text:
+                                                        context = context + "\n" + str(text)
+        except Exception as ex:
+            logger.debug(f"Error extracting context from agent response : {str(ex)}")
+            context = ""
+
+        return context if context else None
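A hypothetical call against the new agent overload above: only file_search output is used as grounding context (per _SUPPORTED_TOOLS), and when no supported tool was called the evaluator returns a "not applicable" result instead of a score (see _real_call above). Configuration values and data are placeholders.

    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<resource>.openai.azure.com",  # placeholder
        "azure_deployment": "<deployment>",                       # placeholder
        "api_key": "<key>",                                       # placeholder
    }

    groundedness = GroundednessEvaluator(model_config)
    result = groundedness(
        query="What is the refund window?",
        response=[
            {"role": "assistant", "content": [
                {"type": "tool_call", "tool_call_id": "call_1", "name": "file_search",
                 "arguments": {"queries": ["refund window"]}},
            ]},
            {"role": "tool", "tool_call_id": "call_1", "content": [
                {"type": "tool_result",
                 "tool_result": [{"content": [{"type": "text", "text": "Refunds are accepted within 30 days."}]}]},
            ]},
            {"role": "assistant", "content": [
                {"type": "text", "text": "You can request a refund within 30 days of purchase."},
            ]},
        ],
        tool_definitions=[{"name": "file_search", "type": "file_search"}],
    )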
--- a/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
+++ b/azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py
@@ -61,11 +61,17 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_INTENT_RESOLUTION_THRESHOLD, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(
--- a/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
+++ b/azure/ai/evaluation/_evaluators/_relevance/_relevance.py
@@ -79,7 +79,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -89,6 +89,7 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
--- a/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
+++ b/azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py
@@ -73,11 +73,19 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(
+    def __init__(
+        self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD, credential=None, **kwargs
+    ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(
--- a/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py
+++ b/azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py
@@ -78,7 +78,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold: float = 3):
+    def __init__(self, model_config, *, threshold: float = 3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -88,6 +88,7 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
--- a/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
+++ b/azure/ai/evaluation/_evaluators/_similarity/_similarity.py
@@ -75,7 +75,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -85,6 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
--- a/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
+++ b/azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py
@@ -69,11 +69,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(