azure-ai-evaluation 1.0.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (277)
  1. azure/ai/evaluation/__init__.py +83 -14
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +140 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +68 -0
  5. azure/ai/evaluation/_aoai/python_grader.py +86 -0
  6. azure/ai/evaluation/_aoai/score_model_grader.py +94 -0
  7. azure/ai/evaluation/_aoai/string_check_grader.py +66 -0
  8. azure/ai/evaluation/_aoai/text_similarity_grader.py +80 -0
  9. azure/ai/evaluation/_azure/__init__.py +3 -0
  10. azure/ai/evaluation/_azure/_clients.py +204 -0
  11. azure/ai/evaluation/_azure/_envs.py +207 -0
  12. azure/ai/evaluation/_azure/_models.py +227 -0
  13. azure/ai/evaluation/_azure/_token_manager.py +129 -0
  14. azure/ai/evaluation/_common/__init__.py +9 -1
  15. azure/ai/evaluation/_common/constants.py +124 -2
  16. azure/ai/evaluation/_common/evaluation_onedp_client.py +169 -0
  17. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  18. azure/ai/evaluation/_common/onedp/_client.py +166 -0
  19. azure/ai/evaluation/_common/onedp/_configuration.py +72 -0
  20. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  21. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  22. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  23. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  24. azure/ai/evaluation/_common/onedp/_utils/__init__.py +6 -0
  25. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1232 -0
  26. azure/ai/evaluation/_common/onedp/_utils/serialization.py +2032 -0
  27. azure/ai/evaluation/_common/onedp/_validation.py +66 -0
  28. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  29. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  30. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  31. azure/ai/evaluation/_common/onedp/aio/_client.py +168 -0
  32. azure/ai/evaluation/_common/onedp/aio/_configuration.py +72 -0
  33. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +49 -0
  35. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +7143 -0
  36. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/models/__init__.py +358 -0
  38. azure/ai/evaluation/_common/onedp/models/_enums.py +447 -0
  39. azure/ai/evaluation/_common/onedp/models/_models.py +5963 -0
  40. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  41. azure/ai/evaluation/_common/onedp/operations/__init__.py +49 -0
  42. azure/ai/evaluation/_common/onedp/operations/_operations.py +8951 -0
  43. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  44. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  54. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  55. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  56. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  57. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  58. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  59. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  60. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  61. azure/ai/evaluation/_common/rai_service.py +578 -69
  62. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  63. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  64. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  65. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  66. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  67. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  68. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  69. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  70. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  71. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  72. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  73. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  74. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  75. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  76. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  77. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  78. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  79. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  80. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  81. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1238 -0
  82. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  83. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  84. azure/ai/evaluation/_common/utils.py +505 -27
  85. azure/ai/evaluation/_constants.py +148 -0
  86. azure/ai/evaluation/_converters/__init__.py +3 -0
  87. azure/ai/evaluation/_converters/_ai_services.py +899 -0
  88. azure/ai/evaluation/_converters/_models.py +467 -0
  89. azure/ai/evaluation/_converters/_sk_services.py +495 -0
  90. azure/ai/evaluation/_eval_mapping.py +83 -0
  91. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -2
  92. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +176 -0
  93. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  94. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  95. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +19 -6
  96. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +47 -22
  97. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +18 -2
  98. azure/ai/evaluation/_evaluate/_eval_run.py +32 -46
  99. azure/ai/evaluation/_evaluate/_evaluate.py +1809 -142
  100. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +992 -0
  101. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -90
  102. azure/ai/evaluation/_evaluate/_utils.py +237 -42
  103. azure/ai/evaluation/_evaluator_definition.py +76 -0
  104. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +80 -28
  105. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  106. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +119 -0
  107. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +40 -4
  108. azure/ai/evaluation/_evaluators/_common/__init__.py +2 -0
  109. azure/ai/evaluation/_evaluators/_common/_base_eval.py +427 -29
  110. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +63 -0
  111. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +269 -12
  112. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +74 -9
  113. azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py +49 -0
  114. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +73 -53
  115. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +35 -5
  116. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +26 -5
  117. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +35 -5
  118. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +34 -4
  119. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +7 -0
  120. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +442 -0
  121. azure/ai/evaluation/_evaluators/_eci/_eci.py +6 -3
  122. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +97 -70
  123. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +39 -3
  124. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +80 -25
  125. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +230 -20
  126. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  127. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  128. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  129. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +196 -0
  130. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +275 -0
  131. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +89 -36
  132. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +22 -4
  133. azure/ai/evaluation/_evaluators/_qa/_qa.py +94 -35
  134. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +100 -4
  135. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +154 -56
  136. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  137. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +202 -0
  138. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +84 -0
  139. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +39 -3
  140. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +166 -26
  141. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +38 -7
  142. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +81 -85
  143. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  144. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +226 -0
  145. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +101 -0
  146. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  147. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  148. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  149. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  150. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  151. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  152. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +298 -0
  153. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +166 -0
  154. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  155. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  156. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  157. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  158. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  159. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  160. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  161. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  162. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  163. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  164. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  165. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  166. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  167. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +102 -0
  168. azure/ai/evaluation/_evaluators/_xpia/xpia.py +20 -4
  169. azure/ai/evaluation/_exceptions.py +24 -1
  170. azure/ai/evaluation/_http_utils.py +7 -5
  171. azure/ai/evaluation/_legacy/__init__.py +3 -0
  172. azure/ai/evaluation/_legacy/_adapters/__init__.py +7 -0
  173. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  174. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  175. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  176. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  177. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  178. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  179. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  180. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  181. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  182. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  183. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  184. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  185. azure/ai/evaluation/_legacy/_batch_engine/_config.py +48 -0
  186. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +477 -0
  187. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  188. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +132 -0
  189. azure/ai/evaluation/_legacy/_batch_engine/_result.py +107 -0
  190. azure/ai/evaluation/_legacy/_batch_engine/_run.py +127 -0
  191. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  192. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +262 -0
  193. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  194. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +97 -0
  195. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +97 -0
  196. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  197. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  198. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +117 -0
  199. azure/ai/evaluation/_legacy/_common/_logging.py +292 -0
  200. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +17 -0
  201. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  202. azure/ai/evaluation/_legacy/prompty/_connection.py +119 -0
  203. azure/ai/evaluation/_legacy/prompty/_exceptions.py +139 -0
  204. azure/ai/evaluation/_legacy/prompty/_prompty.py +430 -0
  205. azure/ai/evaluation/_legacy/prompty/_utils.py +663 -0
  206. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  207. azure/ai/evaluation/_model_configurations.py +26 -0
  208. azure/ai/evaluation/_safety_evaluation/__init__.py +3 -0
  209. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  210. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +917 -0
  211. azure/ai/evaluation/_user_agent.py +32 -1
  212. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -4
  213. azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -4
  214. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -4
  215. azure/ai/evaluation/_version.py +2 -1
  216. azure/ai/evaluation/red_team/__init__.py +22 -0
  217. azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
  218. azure/ai/evaluation/red_team/_agent/_agent_functions.py +261 -0
  219. azure/ai/evaluation/red_team/_agent/_agent_tools.py +461 -0
  220. azure/ai/evaluation/red_team/_agent/_agent_utils.py +89 -0
  221. azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +228 -0
  222. azure/ai/evaluation/red_team/_attack_objective_generator.py +268 -0
  223. azure/ai/evaluation/red_team/_attack_strategy.py +49 -0
  224. azure/ai/evaluation/red_team/_callback_chat_target.py +115 -0
  225. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  226. azure/ai/evaluation/red_team/_evaluation_processor.py +505 -0
  227. azure/ai/evaluation/red_team/_mlflow_integration.py +430 -0
  228. azure/ai/evaluation/red_team/_orchestrator_manager.py +803 -0
  229. azure/ai/evaluation/red_team/_red_team.py +1717 -0
  230. azure/ai/evaluation/red_team/_red_team_result.py +661 -0
  231. azure/ai/evaluation/red_team/_result_processor.py +1708 -0
  232. azure/ai/evaluation/red_team/_utils/__init__.py +37 -0
  233. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +128 -0
  234. azure/ai/evaluation/red_team/_utils/_rai_service_target.py +601 -0
  235. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +114 -0
  236. azure/ai/evaluation/red_team/_utils/constants.py +72 -0
  237. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  238. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  239. azure/ai/evaluation/red_team/_utils/formatting_utils.py +365 -0
  240. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  241. azure/ai/evaluation/red_team/_utils/metric_mapping.py +73 -0
  242. azure/ai/evaluation/red_team/_utils/objective_utils.py +46 -0
  243. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  244. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  245. azure/ai/evaluation/red_team/_utils/strategy_utils.py +218 -0
  246. azure/ai/evaluation/simulator/_adversarial_scenario.py +6 -0
  247. azure/ai/evaluation/simulator/_adversarial_simulator.py +187 -80
  248. azure/ai/evaluation/simulator/_constants.py +1 -0
  249. azure/ai/evaluation/simulator/_conversation/__init__.py +138 -11
  250. azure/ai/evaluation/simulator/_conversation/_conversation.py +6 -2
  251. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  252. azure/ai/evaluation/simulator/_direct_attack_simulator.py +37 -24
  253. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  254. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +56 -28
  255. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  256. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +225 -0
  257. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +12 -10
  258. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +100 -45
  259. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +101 -3
  260. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +31 -11
  261. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  262. azure/ai/evaluation/simulator/_simulator.py +43 -19
  263. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/METADATA +366 -27
  264. azure_ai_evaluation-1.13.3.dist-info/RECORD +305 -0
  265. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/WHEEL +1 -1
  266. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
  267. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
  268. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +0 -55
  269. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
  270. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
  271. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
  272. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
  273. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
  274. azure/ai/evaluation/simulator/_tracing.py +0 -89
  275. azure_ai_evaluation-1.0.1.dist-info/RECORD +0 -119
  276. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info/licenses}/NOTICE.txt +0 -0
  277. {azure_ai_evaluation-1.0.1.dist-info → azure_ai_evaluation-1.13.3.dist-info}/top_level.txt +0 -0
--- a/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py
@@ -4,19 +4,32 @@

 import math
 import re
-from typing import Dict, TypeVar, Union
+import os
+from itertools import chain
+from typing import Dict, Optional, TypeVar, Union, List

-from promptflow.core import AsyncPrompty
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty
 from typing_extensions import override

+from azure.core.credentials import TokenCredential
 from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
+from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from ..._common.utils import construct_prompty_model_config, validate_model_config, parse_quality_evaluator_reason_score
 from . import EvaluatorBase

 try:
-    from ..._user_agent import USER_AGENT
+    from ..._user_agent import UserAgentSingleton
 except ImportError:
-    USER_AGENT = "None"
+
+    class UserAgentSingleton:
+        @property
+        def value(self) -> str:
+            return "None"
+

 T = TypeVar("T")

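The import block above selects the prompty execution backend at import time: the in-package azure.ai.evaluation._legacy.prompty implementation is now the default, and promptflow's AsyncPrompty is only used when the AI_EVALS_USE_PF_PROMPTY environment variable is "true". A minimal sketch of opting back into the promptflow backend, assuming promptflow is installed; the variable has to be set before the evaluator modules are imported:

    import os

    # The backend is chosen when the module is imported, so set the flag first.
    os.environ["AI_EVALS_USE_PF_PROMPTY"] = "true"

    from azure.ai.evaluation import FluencyEvaluator  # a prompty-based evaluator

The diff of _base_prompty_eval.py continues below.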
@@ -37,26 +50,69 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
     :param ignore_queries: If True, queries will be ignored in conversation evaluations. Default is False.
         Useful since some evaluators of this format are response-only.
     :type ignore_queries: bool
+    :keyword is_reasoning_model: This parameter is in preview. If True, updates the config parameters in prompty file based on reasoning models. Defaults to False.
+    :type is_reasoning_model: bool
     """

     _LLM_CALL_TIMEOUT = 600
     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

-    def __init__(self, *, result_key: str, prompty_file: str, model_config: dict, eval_last_turn: bool = False):
+    def __init__(
+        self,
+        *,
+        result_key: str,
+        prompty_file: str,
+        model_config: dict,
+        eval_last_turn: bool = False,
+        threshold: int = 3,
+        credential: Optional[TokenCredential] = None,
+        _higher_is_better: bool = False,
+        **kwargs,
+    ) -> None:
         self._result_key = result_key
+        self._is_reasoning_model = kwargs.get("is_reasoning_model", False)
         self._prompty_file = prompty_file
-        super().__init__(eval_last_turn=eval_last_turn)
+        self._threshold = threshold
+        self._higher_is_better = _higher_is_better
+        super().__init__(eval_last_turn=eval_last_turn, threshold=threshold, _higher_is_better=_higher_is_better)

+        subclass_name = self.__class__.__name__
+        user_agent = f"{UserAgentSingleton().value} (type=evaluator subtype={subclass_name})"
         prompty_model_config = construct_prompty_model_config(
             validate_model_config(model_config),
             self._DEFAULT_OPEN_API_VERSION,
-            USER_AGENT,
+            user_agent,
         )

-        self._flow = AsyncPrompty.load(source=prompty_file, model=prompty_model_config)
+        self._flow = AsyncPrompty.load(
+            source=self._prompty_file,
+            model=prompty_model_config,
+            token_credential=credential,
+            is_reasoning_model=self._is_reasoning_model,
+        )

     # __call__ not overridden here because child classes have such varied signatures that there's no point
     # defining a default here.
+    def _get_binary_result(self, score: float) -> str:
+        """Get the binary result based on the score.
+
+        :param score: The score to evaluate.
+        :type score: float
+        :return: The binary result.
+        :rtype: str
+        """
+        if math.isnan(score):
+            return "unknown"
+        if self._higher_is_better:
+            if score >= self._threshold:
+                return EVALUATION_PASS_FAIL_MAPPING[True]
+            else:
+                return EVALUATION_PASS_FAIL_MAPPING[False]
+        else:
+            if score <= self._threshold:
+                return EVALUATION_PASS_FAIL_MAPPING[True]
+            else:
+                return EVALUATION_PASS_FAIL_MAPPING[False]

     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
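The new _get_binary_result helper is what produces the pass/fail string that now accompanies every score. A standalone sketch of the same rule for illustration, assuming EVALUATION_PASS_FAIL_MAPPING maps True to "pass" and False to "fail"; the helper simply restates the comparison outside the class:

    import math

    def binary_result(score: float, threshold: float, higher_is_better: bool) -> str:
        # Mirrors PromptyEvaluatorBase._get_binary_result with the mapping inlined.
        if math.isnan(score):
            return "unknown"
        if higher_is_better:
            return "pass" if score >= threshold else "fail"
        return "pass" if score <= threshold else "fail"

    # With the base-class defaults shown above (threshold=3, _higher_is_better=False),
    # a score at or below the threshold is reported as a pass.
    assert binary_result(2.0, 3, higher_is_better=False) == "pass"
    assert binary_result(4.0, 3, higher_is_better=False) == "fail"

The diff of _base_prompty_eval.py continues below.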
@@ -69,20 +125,221 @@ class PromptyEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
-        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message="Only text conversation inputs are supported.",
+                internal_message="Only text conversation inputs are supported.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.CONVERSATION,
+            )
+        # Call the prompty flow to get the evaluation result.
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

         score = math.nan
-        if llm_output:
+        if prompty_output_dict:
+            llm_output = prompty_output_dict.get("llm_output", "")
+            input_token_count = prompty_output_dict.get("input_token_count", 0)
+            output_token_count = prompty_output_dict.get("output_token_count", 0)
+            total_token_count = prompty_output_dict.get("total_token_count", 0)
+            finish_reason = prompty_output_dict.get("finish_reason", "")
+            model_id = prompty_output_dict.get("model_id", "")
+            sample_input = prompty_output_dict.get("sample_input", "")
+            sample_output = prompty_output_dict.get("sample_output", "")
             # Parse out score and reason from evaluators known to possess them.
             if self._result_key in PROMPT_BASED_REASON_EVALUATORS:
                 score, reason = parse_quality_evaluator_reason_score(llm_output)
+                binary_result = self._get_binary_result(score)
                 return {
                     self._result_key: float(score),
                     f"gpt_{self._result_key}": float(score),
                     f"{self._result_key}_reason": reason,
+                    f"{self._result_key}_result": binary_result,
+                    f"{self._result_key}_threshold": self._threshold,
+                    f"{self._result_key}_prompt_tokens": input_token_count,
+                    f"{self._result_key}_completion_tokens": output_token_count,
+                    f"{self._result_key}_total_tokens": total_token_count,
+                    f"{self._result_key}_finish_reason": finish_reason,
+                    f"{self._result_key}_model": model_id,
+                    f"{self._result_key}_sample_input": sample_input,
+                    f"{self._result_key}_sample_output": sample_output,
                 }
             match = re.search(r"\d", llm_output)
             if match:
                 score = float(match.group())
-            return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
-        return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
+            binary_result = self._get_binary_result(score)
+            return {
+                self._result_key: float(score),
+                f"gpt_{self._result_key}": float(score),
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_prompt_tokens": input_token_count,
+                f"{self._result_key}_completion_tokens": output_token_count,
+                f"{self._result_key}_total_tokens": total_token_count,
+                f"{self._result_key}_finish_reason": finish_reason,
+                f"{self._result_key}_model": model_id,
+                f"{self._result_key}_sample_input": sample_input,
+                f"{self._result_key}_sample_output": sample_output,
+            }
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
+
+    @staticmethod
+    def _get_built_in_tool_definition(tool_name: str):
+        """Get the definition for the built-in tool."""
+        try:
+            from ..._converters._models import _BUILT_IN_DESCRIPTIONS, _BUILT_IN_PARAMS
+
+            if tool_name in _BUILT_IN_DESCRIPTIONS:
+                return {
+                    "type": tool_name,
+                    "description": _BUILT_IN_DESCRIPTIONS[tool_name],
+                    "name": tool_name,
+                    "parameters": _BUILT_IN_PARAMS.get(tool_name, {}),
+                }
+        except ImportError:
+            pass
+        return None
+
+    def _get_needed_built_in_tool_definitions(self, tool_calls: List[Dict]) -> List[Dict]:
+        """Extract tool definitions needed for the given built-in tool calls."""
+        needed_definitions = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                # Only support converter format: {type: "tool_call", name: "bing_custom_search", arguments: {...}}
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        definition = self._get_built_in_tool_definition(tool_name)
+                        if definition and definition not in needed_definitions:
+                            needed_definitions.append(definition)
+
+        return needed_definitions
+
+    def _extract_tool_names_from_calls(self, tool_calls: List[Dict]) -> List[str]:
+        """Extract just the tool names from tool calls, removing parameters."""
+        tool_names = []
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name:
+                        tool_names.append(tool_name)
+                elif tool_call.get("function", {}).get("name"):
+                    # Handle function call format
+                    tool_names.append(tool_call["function"]["name"])
+                elif tool_call.get("name"):
+                    # Handle direct name format
+                    tool_names.append(tool_call["name"])
+        return tool_names
+
+    def _extract_needed_tool_definitions(
+        self, tool_calls: List[Dict], tool_definitions: List[Dict], error_target: ErrorTarget
+    ) -> List[Dict]:
+        """Extract the tool definitions that are needed for the provided tool calls.
+
+        :param tool_calls: The tool calls that need definitions
+        :type tool_calls: List[Dict]
+        :param tool_definitions: User-provided tool definitions
+        :type tool_definitions: List[Dict]
+        :param error_target: The evaluator-specific error target for exceptions
+        :type error_target: ErrorTarget
+        :return: List of needed tool definitions
+        :rtype: List[Dict]
+        :raises EvaluationException: If validation fails
+        """
+        needed_tool_definitions = []
+
+        # Add all user-provided tool definitions
+        needed_tool_definitions.extend(tool_definitions)
+
+        # Add the needed built-in tool definitions (if they are called)
+        built_in_definitions = self._get_needed_built_in_tool_definitions(tool_calls)
+        needed_tool_definitions.extend(built_in_definitions)
+
+        # OpenAPI tool is a collection of functions, so we need to expand it
+        tool_definitions_expanded = list(
+            chain.from_iterable(
+                tool.get("functions", []) if tool.get("type") == "openapi" else [tool]
+                for tool in needed_tool_definitions
+            )
+        )
+
+        # Validate that all tool calls have corresponding definitions
+        for tool_call in tool_calls:
+            if isinstance(tool_call, dict):
+                tool_type = tool_call.get("type")
+
+                if tool_type == "tool_call":
+                    tool_name = tool_call.get("name")
+                    if tool_name and self._get_built_in_tool_definition(tool_name):
+                        # This is a built-in tool from converter, already handled above
+                        continue
+                    elif tool_name:
+                        # This is a regular function tool from converter
+                        tool_definition_exists = any(
+                            tool.get("name") == tool_name and tool.get("type", "function") == "function"
+                            for tool in tool_definitions_expanded
+                        )
+                        if not tool_definition_exists:
+                            raise EvaluationException(
+                                message=f"Tool definition for {tool_name} not found",
+                                blame=ErrorBlame.USER_ERROR,
+                                category=ErrorCategory.INVALID_VALUE,
+                                target=error_target,
+                            )
+                    else:
+                        raise EvaluationException(
+                            message=f"Tool call missing name: {tool_call}",
+                            blame=ErrorBlame.USER_ERROR,
+                            category=ErrorCategory.INVALID_VALUE,
+                            target=error_target,
+                        )
+                else:
+                    # Unsupported tool format - only converter format is supported
+                    raise EvaluationException(
+                        message=f"Unsupported tool call format. Only converter format is supported: {tool_call}",
+                        blame=ErrorBlame.USER_ERROR,
+                        category=ErrorCategory.INVALID_VALUE,
+                        target=error_target,
+                    )
+            else:
+                # Tool call is not a dictionary
+                raise EvaluationException(
+                    message=f"Tool call is not a dictionary: {tool_call}",
+                    blame=ErrorBlame.USER_ERROR,
+                    category=ErrorCategory.INVALID_VALUE,
+                    target=error_target,
+                )
+
+        return needed_tool_definitions
+
+    def _not_applicable_result(
+        self, error_message: str, threshold: Union[int, float]
+    ) -> Dict[str, Union[str, float, Dict]]:
+        """Return a result indicating that the evaluation is not applicable.
+
+        :param error_message: The error message explaining why evaluation is not applicable.
+        :type error_message: str
+        :param threshold: The threshold value for the evaluator.
+        :type threshold: Union[int, float]
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float, Dict]]
+        """
+        # If no tool calls were made or tool call type is not supported, return not applicable result
+        return {
+            self._result_key: self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_result": "pass",
+            f"{self._result_key}_threshold": threshold,
+            f"{self._result_key}_reason": error_message,
+            f"{self._result_key}_details": {},
+        }
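The tool helpers above only accept the converter tool-call format ({"type": "tool_call", "name": ..., "arguments": {...}}); anything else raises EvaluationException as a user error. A hedged sketch of the shapes they expect — the tool name and parameters below are made up for illustration, not taken from the package:

    # Converter-format tool call, the only shape the validation loop accepts.
    tool_calls = [
        {
            "type": "tool_call",
            "name": "fetch_weather",            # hypothetical function name
            "arguments": {"city": "Seattle"},
        }
    ]

    # A matching user-provided definition. If neither a user definition nor a
    # built-in definition exists for the called name, the helper raises
    # EvaluationException (USER_ERROR / INVALID_VALUE).
    tool_definitions = [
        {
            "type": "function",
            "name": "fetch_weather",
            "description": "Return the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        }
    ]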
--- a/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
+++ b/azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict, TypeVar, Union
+from typing import Dict, TypeVar, Union, Optional

 from typing_extensions import override

@@ -11,9 +11,11 @@ from azure.ai.evaluation._common.constants import (
     Tasks,
     _InternalAnnotationTasks,
 )
-from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
-from azure.ai.evaluation._common.utils import validate_azure_ai_project
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service, evaluate_with_rai_service_multimodal
+from azure.ai.evaluation._common.utils import validate_azure_ai_project, is_onedp_project
 from azure.ai.evaluation._exceptions import EvaluationException
+from azure.ai.evaluation._common.utils import validate_conversation
+from azure.ai.evaluation._constants import _AggregationType
 from azure.core.credentials import TokenCredential

 from . import EvaluatorBase
@@ -34,20 +36,45 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         aggregated. Per-turn results are still be available in the output via the "evaluation_per_turn" key
         when this occurs. Default is False, resulting full conversation evaluation and aggregation.
     :type eval_last_turn: bool
+    :param conversation_aggregation_type: The type of aggregation to perform on the per-turn results of a conversation to produce a single result.
+        Default is ~azure.ai.evaluation._AggregationType.MEAN.
+    :type conversation_aggregation_type: ~azure.ai.evaluation._AggregationType
+    :param threshold: The threshold for the evaluation. Default is 3.
+    :type threshold: Optional[int]
+    :param _higher_is_better: If True, higher scores are better. Default is True.
+    :type _higher_is_better: Optional[bool]
+    :param evaluate_query: If True, the query will be included in the evaluation data when evaluating
+        query-response pairs. If False, only the response will be evaluated. Default is False.
+        Can be passed as a keyword argument.
+    :type evaluate_query: bool
     """

     @override
     def __init__(
         self,
         eval_metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
-        azure_ai_project: dict,
+        azure_ai_project: Union[dict, str],
         credential: TokenCredential,
         eval_last_turn: bool = False,
+        conversation_aggregation_type: _AggregationType = _AggregationType.MEAN,
+        threshold: int = 3,
+        _higher_is_better: Optional[bool] = False,
+        **kwargs,
     ):
-        super().__init__(eval_last_turn=eval_last_turn)
+        super().__init__(
+            eval_last_turn=eval_last_turn,
+            conversation_aggregation_type=conversation_aggregation_type,
+            threshold=threshold,
+            _higher_is_better=_higher_is_better,
+        )
         self._eval_metric = eval_metric
         self._azure_ai_project = validate_azure_ai_project(azure_ai_project)
         self._credential = credential
+        self._threshold = threshold
+
+        # Handle evaluate_query parameter from kwargs
+        self._evaluate_query = kwargs.get("evaluate_query", False)
+        self._higher_is_better = _higher_is_better

     @override
     def __call__(  # pylint: disable=docstring-missing-param
@@ -81,19 +108,52 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
         :return: The evaluation result.
         :rtype: Dict
         """
+        if "response" in eval_input:
+            return await self._evaluate_query_response(eval_input)
+
+        conversation = eval_input.get("conversation", None)
+        return await self._evaluate_conversation(conversation)
+
+    async def _evaluate_conversation(self, conversation: Dict) -> Dict[str, T]:
+        """
+        Evaluates content according to this evaluator's metric.
+        :keyword conversation: The conversation contains list of messages to be evaluated.
+            Each message should have "role" and "content" keys.
+
+        :param conversation: The conversation to evaluate.
+        :type conversation: ~azure.ai.evaluation.Conversation
+        :return: The evaluation score computation based on the Content Safety metric (self.metric).
+        :rtype: Dict[str, Union[float, str]]
+        """
+        # validate inputs
+        validate_conversation(conversation)
+        messages = conversation["messages"]
+        # Run score computation based on supplied metric.
+        result = await evaluate_with_rai_service_multimodal(
+            messages=messages,
+            metric_name=self._eval_metric,
+            project_scope=self._azure_ai_project,
+            credential=self._credential,
+        )
+        return result
+
+    async def _evaluate_query_response(self, eval_input: Dict) -> Dict[str, T]:
         query = eval_input.get("query", None)
         response = eval_input.get("response", None)
-        if query is None or response is None:
+        if response is None:
             raise EvaluationException(
                 message="Not implemented",
                 internal_message=(
-                    "Reached query/response evaluation without supplying query or response."
+                    "Reached query/response evaluation without supplying response."
                     + " This should have failed earlier."
                 ),
             )
-        input_data = {"query": query, "response": response}
+        input_data = {"response": str(response)}
+
+        if query is not None and self._evaluate_query:
+            input_data["query"] = str(query)

-        if "context" in self._singleton_inputs:
+        if "context" in self._get_all_singleton_inputs():
             context = eval_input.get("context", None)
             if context is None:
                 raise EvaluationException(
@@ -130,4 +191,8 @@ class RaiServiceEvaluatorBase(EvaluatorBase[T]):
             return _InternalAnnotationTasks.ECI
         if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
             return Tasks.PROTECTED_MATERIAL
+        if self._eval_metric == EvaluationMetrics.CODE_VULNERABILITY:
+            return Tasks.CODE_VULNERABILITY
+        if self._eval_metric == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
+            return Tasks.UNGROUNDED_ATTRIBUTES
         return Tasks.CONTENT_HARM
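RaiServiceEvaluatorBase now reads an evaluate_query flag from **kwargs and only includes the query in the data sent to the RAI service when that flag is set; by default only the response is scored. A minimal usage sketch, assuming a concrete safety evaluator such as ViolenceEvaluator forwards its keyword arguments to this base class; the project endpoint below is a placeholder, and azure_ai_project may also be the older dict form:

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import ViolenceEvaluator

    violence = ViolenceEvaluator(
        credential=DefaultAzureCredential(),
        azure_ai_project="https://<resource>.services.ai.azure.com/api/projects/<project>",
        evaluate_query=True,  # also score the query; default is response-only
    )

    result = violence(
        query="Describe the scene in the film.",
        response="The two characters argue and then leave the room.",
    )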
--- /dev/null
+++ b/azure/ai/evaluation/_evaluators/_common/_conversation_aggregators.py
@@ -0,0 +1,49 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from typing import Callable, List
+from azure.ai.evaluation._common.math import list_mean
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._constants import _AggregationType
+
+
+def GetAggregator(aggregation_type: _AggregationType) -> Callable[[List[float]], float]:
+    if aggregation_type == _AggregationType.SUM:
+        return sum
+    if aggregation_type == _AggregationType.MEAN:
+        return list_mean
+    if aggregation_type == _AggregationType.MAX:
+        return max
+    if aggregation_type == _AggregationType.MIN:
+        return min
+    if aggregation_type == _AggregationType.CUSTOM:
+        msg = (
+            "Cannot 'get' aggregator function associated with custom aggregation enum."
+            + " This enum value should only be outputted as an indicator of an injected"
+            + " aggregation function, not inputted directly"
+        )
+        raise EvaluationException(
+            message=msg,
+            blame=ErrorBlame.UNKNOWN,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.EVALUATE,
+        )
+    raise EvaluationException(
+        message=f"Unaccounted for aggregation type: {aggregation_type}",
+        blame=ErrorBlame.UNKNOWN,
+        category=ErrorCategory.INVALID_VALUE,
+        target=ErrorTarget.EVALUATE,
+    )
+
+
+def GetAggregatorType(aggregation_function: Callable) -> _AggregationType:
+    if aggregation_function == sum:  # pylint: disable=comparison-with-callable
+        return _AggregationType.SUM
+    if aggregation_function == list_mean:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MEAN
+    if aggregation_function == max:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MAX
+    if aggregation_function == min:  # pylint: disable=comparison-with-callable
+        return _AggregationType.MIN
+    return _AggregationType.CUSTOM
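GetAggregator and GetAggregatorType act as inverses for the built-in aggregation types, with _AggregationType.CUSTOM reserved for injected callables that have no enum counterpart. A small round-trip sketch; list_mean, _AggregationType, and this module are internal to the package, so treat it as illustration rather than supported public API:

    from azure.ai.evaluation._constants import _AggregationType
    from azure.ai.evaluation._evaluators._common._conversation_aggregators import (
        GetAggregator,
        GetAggregatorType,
    )

    aggregate = GetAggregator(_AggregationType.MAX)
    assert aggregate([1.0, 4.0, 2.0]) == 4.0
    assert GetAggregatorType(aggregate) is _AggregationType.MAX

    # A custom callable maps to CUSTOM; asking GetAggregator for CUSTOM raises
    # EvaluationException instead of returning a function.
    assert GetAggregatorType(lambda scores: scores[-1]) is _AggregationType.CUSTOM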