azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluator_definition.py (new file)
@@ -0,0 +1,76 @@
+from abc import ABC
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, field
+
+
+@dataclass
+class EvaluatorMetric:
+    type: str = "ordinal"
+    desirable_direction: Optional[str] = None
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {"type": self.type}
+        if self.desirable_direction is not None:
+            result["desirable_direction"] = self.desirable_direction
+        if self.min_value is not None:
+            result["min_value"] = self.min_value
+        if self.max_value is not None:
+            result["max_value"] = self.max_value
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorMetric":
+        return cls(
+            type=data.get("type", "ordinal"),
+            desirable_direction=data.get("desirable_direction"),
+            min_value=data.get("min_value"),
+            max_value=data.get("max_value"),
+        )
+
+
+@dataclass
+class ObjectParameterDescriptorWithRequired:
+    required: List[str] = field(default_factory=list)
+    type: str = "object"
+    properties: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {"required": self.required, "type": self.type, "properties": self.properties}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ObjectParameterDescriptorWithRequired":
+        return cls(
+            required=data.get("required", []), type=data.get("type", "object"), properties=data.get("properties", {})
+        )
+
+
+class EvaluatorDefinition(ABC):
+    """Base class for evaluator definitions"""
+
+    def __init__(self):
+        self.init_parameters: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.metrics: Dict[str, EvaluatorMetric] = {}
+        self.data_schema: ObjectParameterDescriptorWithRequired = ObjectParameterDescriptorWithRequired()
+        self.type: str = "unknown"
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = {
+            "type": self.type,
+            "init_parameters": self.init_parameters.to_dict(),
+            "metrics": {k: v.to_dict() for k, v in self.metrics.items()},
+            "data_schema": self.data_schema.to_dict(),
+        }
+        return result
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluatorDefinition":
+        # Create a generic instance since specific subclasses are not defined
+        instance = cls.__new__(cls)
+        instance.__init__()
+
+        instance.init_parameters = ObjectParameterDescriptorWithRequired.from_dict(data.get("init_parameters", {}))
+        instance.metrics = {k: EvaluatorMetric.from_dict(v) for k, v in data.get("metrics", {}).items()}
+        instance.data_schema = ObjectParameterDescriptorWithRequired.from_dict(data.get("data_schema", {}))
+        return instance
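
The new `_evaluator_definition` module is pure serialization metadata. A minimal round-trip sketch, assuming the private import path matches the file location above; the `CoherenceDefinition` subclass and its metric values are invented for illustration:

    # Sketch only: the concrete subclass, metric values, and import path are assumptions.
    from azure.ai.evaluation._evaluator_definition import (
        EvaluatorDefinition,
        EvaluatorMetric,
        ObjectParameterDescriptorWithRequired,
    )

    class CoherenceDefinition(EvaluatorDefinition):
        """Hypothetical concrete definition used only for this sketch."""

        def __init__(self):
            super().__init__()
            self.type = "coherence"
            self.metrics = {
                "coherence": EvaluatorMetric(
                    type="ordinal", desirable_direction="increase", min_value=1.0, max_value=5.0
                )
            }
            self.data_schema = ObjectParameterDescriptorWithRequired(
                required=["query", "response"],
                properties={"query": {"type": "string"}, "response": {"type": "string"}},
            )

    definition = CoherenceDefinition()
    payload = definition.to_dict()               # plain, JSON-serializable dict
    restored = CoherenceDefinition.from_dict(payload)
    assert restored.to_dict() == payload         # round trip is lossless for these fields
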
azure/ai/evaluation/_evaluators/_bleu/_bleu.py
@@ -46,6 +46,7 @@ class BleuScoreEvaluator(EvaluatorBase):
         https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

    .. admonition:: Example with Threshold:
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_bleu_score_evaluator]
            :end-before: [END threshold_bleu_score_evaluator]
azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py
@@ -56,23 +56,6 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
    :param kwargs: Additional arguments to pass to the evaluator.
    :type kwargs: Any

-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator with a query and response using azure.ai.evaluation.AzureAIProject.
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START code_vulnerability_evaluator]
-            :end-before: [END code_vulnerability_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call CodeVulnerabilityEvaluator using Azure AI Project URL in following format
-                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
-
    .. note::

        If this evaluator is supplied to the `evaluate` function, the metric
azure/ai/evaluation/_evaluators/_coherence/_coherence.py
@@ -23,6 +23,11 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param threshold: The threshold for the coherence evaluator. Default is 3.
    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool

    .. admonition:: Example:

@@ -66,7 +71,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(self, model_config, *, threshold=3, credential=None):
+    def __init__(self, model_config, *, threshold=3, credential=None, **kwargs):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
@@ -78,6 +83,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            threshold=threshold,
            credential=credential,
            _higher_is_better=self._higher_is_better,
+            **kwargs,
        )

    @overload
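
With `**kwargs` now forwarded to the base class, keyword flags documented above (notably `is_reasoning_model`) can be passed straight through the constructor. A hedged usage sketch; the endpoint, deployment, and key values are placeholders:

    from azure.ai.evaluation import CoherenceEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<reasoning-model-deployment>",
        "api_key": "<api-key>",
    }

    coherence = CoherenceEvaluator(
        model_config=model_config,
        threshold=3,
        is_reasoning_model=True,  # adjusts generation parameters for o1/o3-style deployments
    )

    result = coherence(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
    )
    print(result["coherence"], result["coherence_result"])

The same pass-through applies to FluencyEvaluator and GroundednessEvaluator further down.
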
azure/ai/evaluation/_evaluators/_common/_base_eval.py
@@ -4,12 +4,15 @@

 import inspect
 from abc import ABC, abstractmethod
+import json
+import copy
 from typing import (
     Any,
     Callable,
     Dict,
     Generic,
     List,
+    Tuple,
     TypedDict,
     TypeVar,
     Union,
@@ -111,6 +114,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
     _NOT_APPLICABLE_RESULT = "not applicable"
     _PASS_RESULT = "pass"
     _FAIL_RESULT = "fail"
+    _type = "azure_ai_evaluator"

     # ~~~ METHODS THAT ALMOST ALWAYS NEED TO BE OVERRIDDEN BY CHILDREN~~~

@@ -498,7 +502,7 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
             if message.get("role") == "assistant" and isinstance(message.get("content"), list):
                 for content_item in message.get("content"):
                     if isinstance(content_item, dict) and content_item.get("type") == "tool_call":
-                        tool_calls.append(content_item)
+                        tool_calls.append(copy.deepcopy(content_item))

             # Extract tool results from tool messages
             elif message.get("role") == "tool" and message.get("tool_call_id"):
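
The switch to `copy.deepcopy` matters because the parsed tool calls were previously aliases into the caller's message content. A standalone illustration (the tool name and arguments are invented for the example):

    import copy

    message_content = [{"type": "tool_call", "name": "fetch_weather", "arguments": {"city": "Paris"}}]

    # Without deepcopy, the extracted item aliases the original dict...
    aliased = [item for item in message_content if item.get("type") == "tool_call"]
    aliased[0]["arguments"]["city"] = "London"
    print(message_content[0]["arguments"]["city"])   # "London" -- the conversation was mutated

    # ...with deepcopy, the original conversation is left untouched.
    message_content[0]["arguments"]["city"] = "Paris"
    copied = [copy.deepcopy(item) for item in message_content if item.get("type") == "tool_call"]
    copied[0]["arguments"]["city"] = "London"
    print(message_content[0]["arguments"]["city"])   # "Paris"
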
@@ -516,6 +520,67 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):

         return tool_calls

+    def _extract_tool_names_and_params_from_response(self, response) -> List[Tuple[str, Dict[str, str]]]:
+        """Extract tool names and parameters from the response.
+
+        :param response: The response to parse.
+        :type response: Union[str, List[dict]]
+        :return: List of tuples containing (tool_name, parameters_dict) extracted from the response.
+        :rtype: List[Tuple[str, Dict[str, str]]]
+        """
+        tool_calls = self._parse_tools_from_response(response)
+        tool_name_param_pairs = []
+        for tool_call in tool_calls:
+            if not isinstance(tool_call, dict):
+                raise EvaluationException(
+                    "Tool call must be a dictionary.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.UNKNOWN,
+                )
+            if tool_call.get("type") != "tool_call":
+                raise EvaluationException(
+                    "Tool call must have 'type' set to 'tool_call'.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.INVALID_VALUE,
+                )
+
+            if "name" not in tool_call:
+                raise EvaluationException(
+                    "Tool call missing 'name' field.",
+                    internal_message=str(tool_call),
+                    target=ErrorTarget.EVALUATE,
+                    category=ErrorCategory.MISSING_FIELD,
+                )
+
+            tool_name = str(tool_call["name"]).strip()
+
+            # Extract parameters/arguments
+            parameters = {}
+            if "arguments" in tool_call:
+                args = tool_call["arguments"]
+                if isinstance(args, dict):
+                    # Convert all values to strings for consistent comparison
+                    parameters = {str(k): str(v) for k, v in args.items()}
+                elif isinstance(args, str):
+                    # If arguments is a string, try to parse it as JSON
+                    try:
+                        parsed_args = json.loads(args)
+                        if isinstance(parsed_args, dict):
+                            parameters = {str(k): str(v) for k, v in parsed_args.items()}
+                    except json.JSONDecodeError:
+                        raise EvaluationException(
+                            "Failed to parse tool call arguments as JSON.",
+                            internal_message=str(tool_call),
+                            target=ErrorTarget.EVALUATE,
+                            category=ErrorCategory.INVALID_VALUE,
+                        )
+
+            tool_name_param_pairs.append((tool_name, parameters))
+
+        return tool_name_param_pairs
+
     async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
         """The asynchronous call where real end-to-end evaluation logic is performed.

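
A shape sketch of what the new helper consumes and returns, assuming the converter-style messages that `_parse_tools_from_response` already walks (assistant content items with type "tool_call"); the tool names and arguments are illustrative:

    response = [
        {
            "role": "assistant",
            "content": [
                {"type": "tool_call", "tool_call_id": "call_1", "name": "fetch_weather",
                 "arguments": {"city": "Paris", "days": 3}},
                {"type": "tool_call", "tool_call_id": "call_2", "name": "send_email",
                 "arguments": '{"to": "user@example.com"}'},  # JSON-string arguments are parsed
            ],
        }
    ]

    # Expected output of evaluator._extract_tool_names_and_params_from_response(response),
    # where "evaluator" is any EvaluatorBase subclass instance:
    expected = [
        ("fetch_weather", {"city": "Paris", "days": "3"}),   # argument values coerced to str
        ("send_email", {"to": "user@example.com"}),
    ]
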
@@ -542,14 +607,25 @@ class EvaluatorBase(ABC, Generic[T_EvalValue]):
                 base_key = key[:-6]  # Remove "_score" suffix
                 result_key = f"{base_key}_result"
                 threshold_key = f"{base_key}_threshold"
-
+                threshold_value = (
+                    self._threshold.get(base_key) if isinstance(self._threshold, dict) else self._threshold
+                )
+                if not isinstance(threshold_value, (int, float)):
+                    raise EvaluationException(
+                        "Threshold value must be a number.",
+                        internal_message=str(threshold_value),
+                        target=ErrorTarget.EVALUATE,
+                        category=ErrorCategory.INVALID_VALUE,
+                    )
+
+                result[threshold_key] = threshold_value
                 if self._higher_is_better:
-                    if float(score_value) >=
+                    if float(score_value) >= threshold_value:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                     else:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
                 else:
-                    if float(score_value) <=
+                    if float(score_value) <= threshold_value:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[True]
                     else:
                         result[result_key] = EVALUATION_PASS_FAIL_MAPPING[False]
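
The threshold can now be either a single number or a per-metric dict keyed by the base metric name (the score key minus its "_score" suffix). A standalone sketch of the resolution rule, using ValueError in place of the SDK's EvaluationException; metric names are illustrative:

    from typing import Dict, Union

    def resolve_threshold(threshold: Union[int, float, Dict[str, float]], base_key: str) -> float:
        value = threshold.get(base_key) if isinstance(threshold, dict) else threshold
        if not isinstance(value, (int, float)):
            raise ValueError(f"Threshold for '{base_key}' must be a number, got {value!r}")
        return value

    print(resolve_threshold(3, "relevance"))                             # 3
    print(resolve_threshold({"relevance": 4, "fluency": 2}, "fluency"))  # 2
    try:
        resolve_threshold({"relevance": 4}, "fluency")                   # missing key resolves to None
    except ValueError as exc:
        print(exc)
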
azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -5,7 +5,8 @@
 import math
 import re
 import os
-from
+from itertools import chain
+from typing import Dict, Optional, TypeVar, Union, List

 if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
     from promptflow.core._flow import AsyncPrompty
@@ -132,10 +133,19 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                     category=ErrorCategory.INVALID_VALUE,
                     target=ErrorTarget.CONVERSATION,
                 )
-
+        # Call the prompty flow to get the evaluation result.
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

         score = math.nan
-        if
+        if prompty_output_dict:
+            llm_output = prompty_output_dict.get("llm_output", "")
+            input_token_count = prompty_output_dict.get("input_token_count", 0)
+            output_token_count = prompty_output_dict.get("output_token_count", 0)
+            total_token_count = prompty_output_dict.get("total_token_count", 0)
+            finish_reason = prompty_output_dict.get("finish_reason", "")
+            model_id = prompty_output_dict.get("model_id", "")
+            sample_input = prompty_output_dict.get("sample_input", "")
+            sample_output = prompty_output_dict.get("sample_output", "")
             # Parse out score and reason from evaluators known to possess them.
             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
                 score, reason = parse_quality_evaluator_reason_score(llm_output)
@@ -146,6 +156,13 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                     f"{self._result_key}_reason": reason,
                     f"{self._result_key}_result": binary_result,
                     f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
                 }

             match = re.search(r"\d", llm_output)
@@ -156,6 +173,13 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
                    f"gpt_{self._result_key}": float(score),
                    f"{self._result_key}_result": binary_result,
                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
                }

        binary_result = self._get_binary_result(score)
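
Prompt-based evaluators now surface model telemetry from the prompty call alongside the score, prefixed with the result key. A shape sketch with illustrative values, using "relevance" as the result key:

    example_relevance_result = {
        "relevance": 4.0,
        "gpt_relevance": 4.0,
        "relevance_reason": "The response addresses the question directly.",
        "relevance_result": "pass",
        "relevance_threshold": 3,
        "relevance_prompt_tokens": 412,
        "relevance_completion_tokens": 38,
        "relevance_total_tokens": 450,
        "relevance_finish_reason": "stop",
        "relevance_model": "gpt-4o-2024-08-06",
        "relevance_sample_input": "...",   # serialized request sample from the prompty layer
        "relevance_sample_output": "...",  # serialized response sample
    }
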
@@ -165,3 +189,157 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
            f"{self._result_key}_result": binary_result,
            f"{self._result_key}_threshold": self._threshold,
        }
+
+    @staticmethod
+    def _get_built_in_tool_definition(tool_name: str):
+        """Get the definition for the built-in tool."""
+        try:
+            from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+            if tool_name in _BUILT_IN_DESCRIPTIONS:
+                return {
+                    "type": tool_name,
+                    "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                    "name": tool_name,
+                    "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                }
+        except ImportError:
+            pass
+        return None
+
+    def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+        """Extract tool definitions needed for the given built-in tool calls."""
+        needed_definitions = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        definition = self._get_built_in_tool_definition(tool_name)
+                        if definition and definition not in needed_definitions:
+                            needed_definitions.append(definition)
+
+        return needed_definitions
+
+    def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+        """Extract just the tool names from tool calls, removing parameters."""
+        tool_names = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        tool_names.append(tool_name)
+                elif tool_call.get("function", {}).get("name"):
+                    # Handle function call format
+                    tool_names.append(tool_call["function"]["name"])
+                elif tool_call.get("name"):
+                    # Handle direct name format
+                    tool_names.append(tool_call["name"])
+        return tool_names
+
+    def _extract_needed_tool_definitions(
+        self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+    ) -> List[Dict]:
+        """Extract the tool definitions that are needed for the provided tool calls.
+
+        :param tool_calls: The tool calls that need definitions
+        :type tool_calls: List[Dict]
+        :param tool_definitions: User-provided tool definitions
+        :type tool_definitions: List[Dict]
+        :param error_target: The evaluator-specific error target for exceptions
+        :type error_target: ErrorTarget
+        :return: List of needed tool definitions
+        :rtype: List[Dict]
+        :raises EvaluationException: If validation fails
+        """
+        needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and self._get_built_in_tool_definition(tool_name):
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=error_target,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=error_target,
+                        )
+                else:
+                    # Unsupported tool format - only converter format is supported
+                    raise EvaluationException(
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=error_target,
+                    )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=error_target,
+                )
+
+        return needed_tool_definitions
+
+    def _not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict]]:
+        """Return a result indicating that the evaluation is not applicable.
+
+        :param error_message: The error message explaining why evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluator.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict]]
+        """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
+        return {
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_reason": error_message,
+            f"{self._result_key}_details": {},
+        }
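
A standalone sketch of the matching rule enforced by `_extract_needed_tool_definitions`: a non-built-in call must have a same-named definition of type "function" after any "openapi" definition is expanded into its member functions (built-in tools are resolved from the converter's catalog instead). Tool names are illustrative, and ValueError stands in for EvaluationException:

    from itertools import chain

    tool_calls = [{"type": "tool_call", "name": "fetch_weather", "arguments": {"city": "Paris"}}]

    tool_definitions = [
        {"type": "openapi", "functions": [
            {"type": "function", "name": "fetch_weather", "parameters": {"type": "object", "properties": {}}},
            {"type": "function", "name": "fetch_forecast", "parameters": {"type": "object", "properties": {}}},
        ]},
    ]

    # Flatten openapi definitions into their member functions.
    expanded = list(chain.from_iterable(
        tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
        for tool in tool_definitions
    ))

    for call in tool_calls:
        name = call["name"]
        if not any(t.get("name") == name and t.get("type", "function") == "function" for t in expanded):
            raise ValueError(f"Tool definition for {name} not found")
    print("all tool calls have matching definitions")
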
azure/ai/evaluation/_evaluators/_fluency/_fluency.py
@@ -25,6 +25,11 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param threshold: The threshold for the fluency evaluator. Default is 3.
    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool

    .. admonition:: Example:

@@ -68,7 +73,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
-    def __init__(self, model_config, *, credential=None, threshold=3):
+    def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        self._threshold = threshold
@@ -80,6 +85,7 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            threshold=threshold,
            credential=credential,
            _higher_is_better=self._higher_is_better,
+            **kwargs,
        )

    @overload
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py
@@ -5,7 +5,7 @@ import os, logging
 from typing import Dict, List, Optional, Union, Any, Tuple

 from typing_extensions import overload, override
-from azure.ai.evaluation._legacy.
+from azure.ai.evaluation._legacy.prompty import AsyncPrompty

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._model_configurations import Conversation
@@ -33,8 +33,7 @@ logger = logging.getLogger(__name__)


 class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """
-    Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
+    """Evaluates groundedness score for a given query (optional), response, and context or a multi-turn conversation,
     including reasoning.

     The groundedness measure assesses the correspondence between claims in an AI-generated answer and the source
@@ -50,6 +49,11 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        ~azure.ai.evaluation.OpenAIModelConfiguration]
    :param threshold: The threshold for the groundedness evaluator. Default is 3.
    :type threshold: int
+    :param credential: The credential for authenticating to Azure AI service.
+    :type credential: ~azure.core.credentials.TokenCredential
+    :keyword is_reasoning_model: If True, the evaluator will use reasoning model configuration (o1/o3 models).
+        This will adjust parameters like max_completion_tokens and remove unsupported parameters. Default is False.
+    :paramtype is_reasoning_model: bool

    .. admonition:: Example:

@@ -61,6 +65,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            :caption: Initialize and call a GroundednessEvaluator.

    .. admonition:: Example with Threshold:
+
        .. literalinclude:: ../samples/evaluation_samples_threshold.py
            :start-after: [START threshold_groundedness_evaluator]
            :end-before: [END threshold_groundedness_evaluator]
@@ -107,6 +112,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            threshold=threshold,
            credential=credential,
            _higher_is_better=self._higher_is_better,
+            **kwargs,
        )
        self._model_config = model_config
        self.threshold = threshold
@@ -196,18 +202,24 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
        """

        if kwargs.get("query", None):
-
-            prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
-            self._prompty_file = prompty_path
-            prompty_model_config = construct_prompty_model_config(
-                validate_model_config(self._model_config),
-                self._DEFAULT_OPEN_API_VERSION,
-                UserAgentSingleton().value,
-            )
-            self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+            self._ensure_query_prompty_loaded()

        return super().__call__(*args, **kwargs)

+    def _ensure_query_prompty_loaded(self):
+        """Switch to the query prompty file if not already loaded."""
+
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_WITH_QUERY)
+
+        self._prompty_file = prompty_path
+        prompty_model_config = construct_prompty_model_config(
+            validate_model_config(self._model_config),
+            self._DEFAULT_OPEN_API_VERSION,
+            UserAgentSingleton().value,
+        )
+        self._flow = AsyncPrompty.load(source=self._prompty_file, model=prompty_model_config)
+
    def _has_context(self, eval_input: dict) -> bool:
        """
        Return True if eval_input contains a non-empty 'context' field.
@@ -226,7 +238,7 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
-        if "query"
+        if eval_input.get("query", None) is None:
            return await super()._do_eval(eval_input)

        contains_context = self._has_context(eval_input)
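
Usage sketch of the query-aware path: when a query is supplied, the evaluator now swaps in the query prompty via the new `_ensure_query_prompty_loaded` helper before scoring; without a query the default prompty is used. Endpoint, deployment, and key values below are placeholders:

    from azure.ai.evaluation import GroundednessEvaluator

    model_config = {
        "azure_endpoint": "https://<your-resource>.openai.azure.com",
        "azure_deployment": "<deployment-name>",
        "api_key": "<api-key>",
    }

    groundedness = GroundednessEvaluator(model_config=model_config, threshold=3)

    # Without a query: default groundedness prompty.
    result = groundedness(
        response="Paris is the capital of France.",
        context="France's capital city is Paris.",
    )

    # With a query: the query-aware prompty is loaded before scoring.
    result_with_query = groundedness(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
        context="France's capital city is Paris.",
    )
    print(result_with_query["groundedness"], result_with_query["groundedness_result"])
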
@@ -273,6 +285,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
            response = kwargs.get("response")
            tool_definitions = kwargs.get("tool_definitions")

+            if query and self._prompty_file != self._PROMPTY_FILE_WITH_QUERY:
+                self._ensure_query_prompty_loaded()
+
            if (not query) or (not response):  # or not tool_definitions:
                msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
                raise EvaluationException(