azure-ai-evaluation 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.

This version of azure-ai-evaluation has been flagged as potentially problematic.

Files changed (142):
  1. azure/ai/evaluation/__init__.py +27 -1
  2. azure/ai/evaluation/_azure/_models.py +6 -6
  3. azure/ai/evaluation/_common/constants.py +6 -2
  4. azure/ai/evaluation/_common/rai_service.py +39 -5
  5. azure/ai/evaluation/_common/raiclient/__init__.py +34 -0
  6. azure/ai/evaluation/_common/raiclient/_client.py +128 -0
  7. azure/ai/evaluation/_common/raiclient/_configuration.py +87 -0
  8. azure/ai/evaluation/_common/raiclient/_model_base.py +1235 -0
  9. azure/ai/evaluation/_common/raiclient/_patch.py +20 -0
  10. azure/ai/evaluation/_common/raiclient/_serialization.py +2050 -0
  11. azure/ai/evaluation/_common/raiclient/_version.py +9 -0
  12. azure/ai/evaluation/_common/raiclient/aio/__init__.py +29 -0
  13. azure/ai/evaluation/_common/raiclient/aio/_client.py +130 -0
  14. azure/ai/evaluation/_common/raiclient/aio/_configuration.py +87 -0
  15. azure/ai/evaluation/_common/raiclient/aio/_patch.py +20 -0
  16. azure/ai/evaluation/_common/raiclient/aio/operations/__init__.py +25 -0
  17. azure/ai/evaluation/_common/raiclient/aio/operations/_operations.py +981 -0
  18. azure/ai/evaluation/_common/raiclient/aio/operations/_patch.py +20 -0
  19. azure/ai/evaluation/_common/raiclient/models/__init__.py +60 -0
  20. azure/ai/evaluation/_common/raiclient/models/_enums.py +18 -0
  21. azure/ai/evaluation/_common/raiclient/models/_models.py +651 -0
  22. azure/ai/evaluation/_common/raiclient/models/_patch.py +20 -0
  23. azure/ai/evaluation/_common/raiclient/operations/__init__.py +25 -0
  24. azure/ai/evaluation/_common/raiclient/operations/_operations.py +1225 -0
  25. azure/ai/evaluation/_common/raiclient/operations/_patch.py +20 -0
  26. azure/ai/evaluation/_common/raiclient/py.typed +1 -0
  27. azure/ai/evaluation/_common/utils.py +23 -3
  28. azure/ai/evaluation/_constants.py +7 -0
  29. azure/ai/evaluation/_converters/__init__.py +3 -0
  30. azure/ai/evaluation/_converters/_ai_services.py +804 -0
  31. azure/ai/evaluation/_converters/_models.py +302 -0
  32. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +10 -3
  33. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +104 -0
  34. azure/ai/evaluation/_evaluate/_batch_run/batch_clients.py +82 -0
  35. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +18 -12
  36. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +9 -4
  37. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +42 -22
  38. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +1 -1
  39. azure/ai/evaluation/_evaluate/_eval_run.py +2 -2
  40. azure/ai/evaluation/_evaluate/_evaluate.py +109 -64
  41. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -89
  42. azure/ai/evaluation/_evaluate/_utils.py +3 -3
  43. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +23 -3
  44. azure/ai/evaluation/_evaluators/_code_vulnerability/__init__.py +5 -0
  45. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +120 -0
  46. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +21 -2
  47. azure/ai/evaluation/_evaluators/_common/_base_eval.py +44 -4
  48. azure/ai/evaluation/_evaluators/_common/_base_multi_eval.py +4 -2
  49. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +44 -5
  50. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +16 -4
  51. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +42 -5
  52. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +15 -0
  53. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +15 -0
  54. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +15 -0
  55. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +15 -0
  56. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +28 -4
  57. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +21 -2
  58. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +26 -3
  59. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +22 -4
  60. azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py +7 -0
  61. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +152 -0
  62. azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty +161 -0
  63. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +26 -3
  64. azure/ai/evaluation/_evaluators/_qa/_qa.py +51 -7
  65. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +26 -2
  66. azure/ai/evaluation/_evaluators/_response_completeness/__init__.py +7 -0
  67. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +158 -0
  68. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +99 -0
  69. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +21 -2
  70. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +113 -4
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +23 -3
  72. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +24 -5
  73. azure/ai/evaluation/_evaluators/_task_adherence/__init__.py +7 -0
  74. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +148 -0
  75. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +117 -0
  76. azure/ai/evaluation/_evaluators/_tool_call_accuracy/__init__.py +9 -0
  77. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +292 -0
  78. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +71 -0
  79. azure/ai/evaluation/_evaluators/_ungrounded_attributes/__init__.py +5 -0
  80. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +103 -0
  81. azure/ai/evaluation/_evaluators/_xpia/xpia.py +2 -0
  82. azure/ai/evaluation/_exceptions.py +5 -0
  83. azure/ai/evaluation/_legacy/__init__.py +3 -0
  84. azure/ai/evaluation/_legacy/_adapters/__init__.py +21 -0
  85. azure/ai/evaluation/_legacy/_adapters/_configuration.py +45 -0
  86. azure/ai/evaluation/_legacy/_adapters/_constants.py +10 -0
  87. azure/ai/evaluation/_legacy/_adapters/_errors.py +29 -0
  88. azure/ai/evaluation/_legacy/_adapters/_flows.py +28 -0
  89. azure/ai/evaluation/_legacy/_adapters/_service.py +16 -0
  90. azure/ai/evaluation/_legacy/_adapters/client.py +51 -0
  91. azure/ai/evaluation/_legacy/_adapters/entities.py +26 -0
  92. azure/ai/evaluation/_legacy/_adapters/tracing.py +28 -0
  93. azure/ai/evaluation/_legacy/_adapters/types.py +15 -0
  94. azure/ai/evaluation/_legacy/_adapters/utils.py +31 -0
  95. azure/ai/evaluation/_legacy/_batch_engine/__init__.py +9 -0
  96. azure/ai/evaluation/_legacy/_batch_engine/_config.py +45 -0
  97. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +368 -0
  98. azure/ai/evaluation/_legacy/_batch_engine/_exceptions.py +88 -0
  99. azure/ai/evaluation/_legacy/_batch_engine/_logging.py +292 -0
  100. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +23 -0
  101. azure/ai/evaluation/_legacy/_batch_engine/_result.py +99 -0
  102. azure/ai/evaluation/_legacy/_batch_engine/_run.py +121 -0
  103. azure/ai/evaluation/_legacy/_batch_engine/_run_storage.py +128 -0
  104. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +217 -0
  105. azure/ai/evaluation/_legacy/_batch_engine/_status.py +25 -0
  106. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +105 -0
  107. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +82 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_utils_deprecated.py +131 -0
  109. azure/ai/evaluation/_legacy/prompty/__init__.py +36 -0
  110. azure/ai/evaluation/_legacy/prompty/_connection.py +182 -0
  111. azure/ai/evaluation/_legacy/prompty/_exceptions.py +59 -0
  112. azure/ai/evaluation/_legacy/prompty/_prompty.py +313 -0
  113. azure/ai/evaluation/_legacy/prompty/_utils.py +545 -0
  114. azure/ai/evaluation/_legacy/prompty/_yaml_utils.py +99 -0
  115. azure/ai/evaluation/_safety_evaluation/__init__.py +1 -1
  116. azure/ai/evaluation/_safety_evaluation/_generated_rai_client.py +0 -0
  117. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +251 -150
  118. azure/ai/evaluation/_version.py +1 -1
  119. azure/ai/evaluation/red_team/__init__.py +19 -0
  120. azure/ai/evaluation/red_team/_attack_objective_generator.py +195 -0
  121. azure/ai/evaluation/red_team/_attack_strategy.py +45 -0
  122. azure/ai/evaluation/red_team/_callback_chat_target.py +74 -0
  123. azure/ai/evaluation/red_team/_default_converter.py +21 -0
  124. azure/ai/evaluation/red_team/_red_team.py +1887 -0
  125. azure/ai/evaluation/red_team/_red_team_result.py +382 -0
  126. azure/ai/evaluation/red_team/_utils/__init__.py +3 -0
  127. azure/ai/evaluation/red_team/_utils/constants.py +65 -0
  128. azure/ai/evaluation/red_team/_utils/formatting_utils.py +165 -0
  129. azure/ai/evaluation/red_team/_utils/logging_utils.py +139 -0
  130. azure/ai/evaluation/red_team/_utils/strategy_utils.py +192 -0
  131. azure/ai/evaluation/simulator/_adversarial_scenario.py +3 -1
  132. azure/ai/evaluation/simulator/_adversarial_simulator.py +54 -27
  133. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +145 -0
  134. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +71 -1
  135. azure/ai/evaluation/simulator/_simulator.py +1 -1
  136. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/METADATA +80 -15
  137. azure_ai_evaluation-1.5.0.dist-info/RECORD +207 -0
  138. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/WHEEL +1 -1
  139. azure/ai/evaluation/simulator/_tracing.py +0 -89
  140. azure_ai_evaluation-1.3.0.dist-info/RECORD +0 -119
  141. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/NOTICE.txt +0 -0
  142. {azure_ai_evaluation-1.3.0.dist-info → azure_ai_evaluation-1.5.0.dist-info}/top_level.txt +0 -0
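The listing above shows that 1.5.0 adds several new evaluator packages (code vulnerability, intent resolution, response completeness, task adherence, tool call accuracy, ungrounded attributes), a red_team module, and a _legacy layer that decouples the package from promptflow. A minimal upgrade sketch to pick these up; the import check assumes IntentResolutionEvaluator is re-exported from the top-level namespace (consistent with the __init__.py change above, but not shown in this section):

    # Shell: upgrade to the new wheel
    #   pip install --upgrade azure-ai-evaluation==1.5.0
    # Python: quick sanity check of the new surface (export location assumed)
    from azure.ai.evaluation import IntentResolutionEvaluator  # new in 1.5.0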
azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py

@@ -7,6 +7,7 @@ from typing import List, Dict
  from typing_extensions import overload, override

  from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


  class F1ScoreEvaluator(EvaluatorBase):
@@ -25,6 +26,8 @@ class F1ScoreEvaluator(EvaluatorBase):
      model's responses. It provides a balanced evaluation of your model's performance in terms of capturing accurate
      information in the response.

+     :param threshold: The threshold for the F1 score evaluator. Default is 0.5.
+     :type threshold: float

      .. admonition:: Example:

@@ -34,13 +37,24 @@ class F1ScoreEvaluator(EvaluatorBase):
              :language: python
              :dedent: 8
              :caption: Initialize and call an F1ScoreEvaluator.
+
+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_f1_score_evaluator]
+             :end-before: [END threshold_f1_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call an F1ScoreEvaluator.
      """

      id = "azureml://registries/azureml/models/F1Score-Evaluator/versions/3"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

-     def __init__(self):
-         super().__init__()
+     def __init__(self, *, threshold=0.5):
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)

      @classmethod
      def _compute_f1_score(cls, response: str, ground_truth: str) -> float:
@@ -115,8 +129,18 @@ class F1ScoreEvaluator(EvaluatorBase):
          response = eval_input["response"]
          # Run f1 score computation.
          f1_result = self._compute_f1_score(response=response, ground_truth=ground_truth)
-
-         return {"f1_score": f1_result}
+         binary_result = False
+         if self._higher_is_better:
+             if f1_result >= self._threshold:
+                 binary_result = True
+         else:
+             if f1_result <= self._threshold:
+                 binary_result = True
+         return {
+             "f1_score": f1_result,
+             "f1_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+             "f1_threshold": self._threshold,
+         }

      @overload  # type: ignore
      def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
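Per the hunks above, F1ScoreEvaluator now accepts a keyword-only threshold (default 0.5) and returns f1_result and f1_threshold alongside f1_score, with the pass/fail label taken from EVALUATION_PASS_FAIL_MAPPING. A minimal usage sketch under that reading; the input strings and printed values are illustrative only, not from a real run:

    from azure.ai.evaluation import F1ScoreEvaluator

    # Keyword-only threshold added in 1.5.0; scores at or above it map to a passing result.
    f1 = F1ScoreEvaluator(threshold=0.6)
    result = f1(
        response="The capital of Japan is Tokyo.",
        ground_truth="Tokyo is Japan's capital.",
    )
    # Output keys per the diff: f1_score, f1_result, f1_threshold.
    print(result["f1_score"], result["f1_result"], result["f1_threshold"])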
azure/ai/evaluation/_evaluators/_fluency/_fluency.py

@@ -23,6 +23,8 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]
+     :param threshold: The threshold for the fluency evaluator. Default is 5.
+     :type threshold: int

      .. admonition:: Example:

@@ -33,6 +35,15 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              :dedent: 8
              :caption: Initialize and call a FluencyEvaluator.

+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_fluency_evaluator]
+             :end-before: [END threshold_fluency_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a FluencyEvaluator.
+
      .. note::

          To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -47,10 +58,18 @@ class FluencyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config):
+     def __init__(self, model_config, *, threshold=3):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             _higher_is_better=self._higher_is_better
+         )

      @overload
      def __call__(
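Note that the added docstring advertises a default threshold of 5 while the new signature defaults to threshold=3; the code is what ships. A usage sketch assuming the existing 1.x call shape (model_config as an Azure OpenAI configuration dict, fluency scored from the response alone); endpoint and deployment values are placeholders:

    import os
    from azure.ai.evaluation import FluencyEvaluator

    # Judge-model configuration; values below are placeholders read from the environment.
    model_config = {
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
        "azure_deployment": os.environ["AZURE_OPENAI_DEPLOYMENT"],
    }

    # threshold is keyword-only; the signature default is 3 despite the docstring's "Default is 5".
    fluency = FluencyEvaluator(model_config, threshold=4)
    result = fluency(response="The weather in Seattle is mild and rainy in winter.")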
azure/ai/evaluation/_evaluators/_gleu/_gleu.py

@@ -8,6 +8,7 @@ from typing_extensions import overload, override
  from azure.ai.evaluation._common.utils import nltk_tokenize

  from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


  class GleuScoreEvaluator(EvaluatorBase):
@@ -22,6 +23,9 @@ class GleuScoreEvaluator(EvaluatorBase):
      GLEU scores range from 0 to 1, where a value of 1 represents perfect overlap between the response and
      the ground truth and a value of 0 indicates no overlap.

+     :param threshold: The threshold for the GLEU evaluator. Default is 0.5.
+     :type threshold: float
+
      .. admonition:: Example:

          .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -30,14 +34,25 @@ class GleuScoreEvaluator(EvaluatorBase):
              :language: python
              :dedent: 8
              :caption: Initialize and call a GleuScoreEvaluator.
+
+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_gleu_score_evaluator]
+             :end-before: [END threshold_gleu_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a GleuScoreEvaluator.
      """

      id = "azureml://registries/azureml/models/Gleu-Score-Evaluator/versions/3"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self):
-         super().__init__()
+     def __init__(self, *, threshold=0.5):
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)

      @override
      async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
@@ -54,9 +69,17 @@ class GleuScoreEvaluator(EvaluatorBase):
          hypothesis_tokens = nltk_tokenize(response)

          score = sentence_gleu([reference_tokens], hypothesis_tokens)
-
+         binary_result = False
+         if self._higher_is_better:
+             if score >= self._threshold:
+                 binary_result = True
+         else:
+             if score <= self._threshold:
+                 binary_result = True
          return {
              "gleu_score": score,
+             "gleu_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+             "gleu_threshold": self._threshold,
          }

      @overload  # type: ignore
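The same threshold-to-result pattern recurs in every math-based evaluator in this release (F1, GLEU, METEOR, BLEU, ROUGE per the file list). A compact standalone restatement of that logic, as I read it from the hunks; the "pass"/"fail" strings are an assumption about what EVALUATION_PASS_FAIL_MAPPING resolves to, since the constant's definition is not shown in this section:

    # Standalone sketch of the pass/fail logic added to the math-based evaluators.
    def to_result(score: float, threshold: float, higher_is_better: bool = True) -> str:
        # Higher-is-better metrics pass at or above the threshold; the inverted
        # branch exists for metrics where lower values are better.
        passed = score >= threshold if higher_is_better else score <= threshold
        return "pass" if passed else "fail"

    print(to_result(0.62, 0.5))  # "pass"
    print(to_result(0.31, 0.5))  # "fail"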
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py

@@ -5,7 +5,7 @@ import os
  from typing import Dict, List, Optional, Union

  from typing_extensions import overload, override
- from promptflow.core import AsyncPrompty
+ from azure.ai.evaluation._legacy._adapters._flows import AsyncPrompty

  from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
  from azure.ai.evaluation._model_configurations import Conversation
@@ -33,7 +33,9 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      :param model_config: Configuration for the Azure OpenAI model.
      :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
          ~azure.ai.evaluation.OpenAIModelConfiguration]
-
+     :param threshold: The threshold for the groundedness evaluator. Default is 5.
+     :type threshold: int
+
      .. admonition:: Example:

          .. literalinclude:: ../samples/evaluation_samples_evaluate.py
@@ -43,6 +45,14 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
              :dedent: 8
              :caption: Initialize and call a GroundednessEvaluator.

+     .. admonition:: Example with Threshold:
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_groundedness_evaluator]
+             :end-before: [END threshold_groundedness_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a GroundednessEvaluator.
+
      .. note::

          To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -59,12 +69,20 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, model_config):
+     def __init__(self, model_config, *, threshold=3, **kwargs):
          current_dir = os.path.dirname(__file__)
          prompty_path = os.path.join(current_dir, self._PROMPTY_FILE_NO_QUERY)  # Default to no query

-         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+         self._higher_is_better = True
+         super().__init__(
+             model_config=model_config,
+             prompty_file=prompty_path,
+             result_key=self._RESULT_KEY,
+             threshold=threshold,
+             _higher_is_better=self._higher_is_better
+         )
          self._model_config = model_config
+         self.threshold = threshold
          # Needs to be set because it's used in call method to re-validate prompt if `query` is provided

      @overload
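Two changes stand out here: AsyncPrompty is now imported from the package's own _legacy adapter instead of promptflow.core (part of the promptflow decoupling visible in the new _legacy files), and a threshold was added (again, docstring says 5, signature says 3). A usage sketch assuming the existing context/response call shape and the same model_config dict as in the fluency sketch above; whether the prompty-based evaluators also emit *_result and *_threshold keys depends on the _base_prompty_eval.py changes, which are not shown in this section:

    from azure.ai.evaluation import GroundednessEvaluator

    # Signature default for threshold is 3 in 1.5.0.
    groundedness = GroundednessEvaluator(model_config, threshold=4)
    result = groundedness(
        context="The Eiffel Tower is 330 metres tall and located in Paris.",
        response="The Eiffel Tower is located in Paris and is about 330 metres tall.",
    )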
azure/ai/evaluation/_evaluators/_intent_resolution/__init__.py (new file)

@@ -0,0 +1,7 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ from ._intent_resolution import IntentResolutionEvaluator
+
+ __all__ = ["IntentResolutionEvaluator"]
azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py (new file)

@@ -0,0 +1,152 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+ import math
+ from typing import Dict, Union, List, Optional
+
+ from typing_extensions import overload, override
+
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+ from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+ from azure.ai.evaluation._model_configurations import Conversation, Message
+ from ..._common.utils import check_score_is_valid
+ from azure.ai.evaluation._common._experimental import experimental
+
+ @experimental
+ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+     """
+     Evaluates intent resolution for a given query and response or a multi-turn conversation, including reasoning.
+
+     The intent resolution evaluator assesses whether the user intent was correctly identified and resolved.
+
+     :param model_config: Configuration for the Azure OpenAI model.
+     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+     .. admonition:: Example:
+
+         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+             :start-after: [START intent_resolution_evaluator]
+             :end-before: [END intent_resolution_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize and call an IntentResolutionEvaluator with a query and response.
+
+     """
+
+     _PROMPTY_FILE = "intent_resolution.prompty"
+     _RESULT_KEY = "intent_resolution"
+     _OPTIONAL_PARAMS = ["tool_definitions"]
+
+     _MIN_INTENT_RESOLUTION_SCORE = 1
+     _MAX_INTENT_RESOLUTION_SCORE = 5
+     _DEFAULT_INTENT_RESOLUTION_THRESHOLD = 3
+
+     id = None
+     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+     @override
+     def __init__(self, model_config, *, threshold = _DEFAULT_INTENT_RESOLUTION_THRESHOLD):
+         current_dir = os.path.dirname(__file__)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+         self.threshold = threshold
+         super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
+
+     @overload
+     def __call__(
+         self,
+         *,
+         query : Union[str, List[dict]],
+         response : Union[str, List[dict]],
+         tool_definitions : Optional[Union[dict, List[dict]]] = None,
+     ) -> Dict[str, Union[str, float]]:
+         """Evaluate intent resolution for a given query, response and optional tool definitions.
+         The query and response can be either a string or a list of messages.
+
+         Example with string inputs and no tools:
+             evaluator = IntentResolutionEvaluator(model_config)
+             query = "What is the weather today?"
+             response = "The weather is sunny."
+
+             result = evaluator(query=query, response=response)
+
+         Example with list of messages:
+             evaluator = IntentResolutionEvaluator(model_config)
+             query: [{'role': 'system', 'content': 'You are a friendly and helpful customer service agent.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Hi, I need help with the last 2 orders on my account #888. Could you please update me on their status?'}]}]
+             response: [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hello! Let me quickly look up your account details.'}]}, {'createdAt': 1700000075, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_001', 'type': 'function', 'function': {'name': 'get_orders', 'arguments': {'account_number': '888'}}}}]}, {'createdAt': 1700000080, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_001', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '[{ "order_id": "123" }, { "order_id": "124" }]'}]}, {'createdAt': 1700000085, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'Thanks for your patience. I see two orders on your account. Let me fetch the details for both.'}]}, {'createdAt': 1700000090, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_002', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '123'}}}}, {'type': 'tool_call', 'tool_call': {'id': 'tool_call_20250310_003', 'type': 'function', 'function': {'name': 'get_order', 'arguments': {'order_id': '124'}}}}]}, {'createdAt': 1700000095, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_002', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "123", "status": "shipped", "delivery_date": "2025-03-15" } }'}]}, {'createdAt': 1700000100, 'run_id': '0', 'tool_call_id': 'tool_call_20250310_003', 'role': 'tool', 'content': [{'type': 'tool_result', 'tool_result': '{ "order": { "id": "124", "status": "delayed", "expected_delivery": "2025-03-20" } }'}]}, {'createdAt': 1700000105, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': 'The order with ID 123 has been shipped and is expected to be delivered on March 15, 2025. However, the order with ID 124 is delayed and should now arrive by March 20, 2025. Is there anything else I can help you with?'}]}]
+             tool_definitions: [{'name': 'get_orders', 'description': 'Get the list of orders for a given account number.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to get the orders for.'}}}}, {'name': 'get_order', 'description': 'Get the details of a specific order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID to get the details for.'}}}}, {'name': 'initiate_return', 'description': 'Initiate the return process for an order.', 'parameters': {'type': 'object', 'properties': {'order_id': {'type': 'string', 'description': 'The order ID for the return process.'}}}}, {'name': 'update_shipping_address', 'description': 'Update the shipping address for a given account.', 'parameters': {'type': 'object', 'properties': {'account_number': {'type': 'string', 'description': 'The account number to update.'}, 'new_address': {'type': 'string', 'description': 'The new shipping address.'}}}}]
+
+             result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+         :keyword query: The query to be evaluated which is either a string or a list of messages.
+             The list of messages is the previous conversation history of the user and agent, including system messages and tool calls.
+         :paramtype query: Union[str, List[dict]]
+         :keyword response: The response to be evaluated, which is either a string or a list of messages (full agent response potentially including tool calls)
+         :paramtype response: Union[str, List[dict]]
+         :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+         :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
+         :return: A dictionary with the intent resolution evaluation
+         :rtype: Dict[str, Union[str, float]]
+         """
+
+     @override
+     def __call__( # pylint: disable=docstring-missing-param
+         self,
+         *args,
+         **kwargs,
+     ):
+         """
+         Invokes the instance using the overloaded __call__ signature.
+
+         For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+         """
+         return super().__call__(*args, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # type: ignore[override]
+         """Do intent resolution evaluation.
+
+         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         # we override the _do_eval method as we want the output to be a dictionary, which is a different schema than _base_prompty_eval.py
+         if "query" not in eval_input and "response" not in eval_input:
+             raise EvaluationException(
+                 message=f"Both query and response must be provided as input to the intent resolution evaluator.",
+                 internal_message=f"Both query and response must be provided as input to the intent resolution evaluator.",
+                 blame=ErrorBlame.USER_ERROR,
+                 category=ErrorCategory.MISSING_FIELD,
+                 target=ErrorTarget.INTENT_RESOLUTION_EVALUATOR,
+             )
+         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+         # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+         if isinstance(llm_output, dict):
+             score = llm_output.get("resolution_score", math.nan)
+             if not check_score_is_valid(score, IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE, IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE):
+                 raise EvaluationException(
+                     message=f"Invalid score value: {score}. Expected a number in range [{IntentResolutionEvaluator._MIN_INTENT_RESOLUTION_SCORE}, {IntentResolutionEvaluator._MAX_INTENT_RESOLUTION_SCORE}].",
+                     internal_message="Invalid score value.",
+                     category=ErrorCategory.FAILED_EXECUTION,
+                     blame=ErrorBlame.SYSTEM_ERROR,
+                 )
+             reason = llm_output.get("explanation", "")
+             score = float(score)
+             score_result = 'pass' if score >= self.threshold else 'fail'
+
+             #remove fields 'explanation' and 'resolution_score' from llm_output as they are already included in the response_dict
+             if 'explanation' in llm_output: llm_output.pop("explanation")
+             if 'resolution_score' in llm_output: llm_output.pop("resolution_score")
+
+             response_dict = {
+                 f"{self._result_key}" : score,
+                 f"{self._result_key}_result" : score_result,
+                 f"{self._result_key}_threshold" : self.threshold,
+                 f"{self._result_key}_reason" : reason,
+                 f"additional_details" : llm_output
+             }
+             return response_dict
+         # If llm_output is not a dictionary, return NaN for the score. This should never happen
+         return {self._result_key: math.nan}
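A usage sketch for the new evaluator, grounded in the docstring examples above; the import path assumes the top-level re-export added in __init__.py, and model_config is the same judge configuration used in the fluency sketch:

    from azure.ai.evaluation import IntentResolutionEvaluator

    # Experimental evaluator; threshold defaults to 3 on a 1-5 scale.
    intent = IntentResolutionEvaluator(model_config, threshold=3)
    result = intent(
        query="What is the weather today?",
        response="The weather is sunny.",
    )
    # Per _do_eval above, the result carries intent_resolution, intent_resolution_result,
    # intent_resolution_threshold, intent_resolution_reason, and additional_details.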
azure/ai/evaluation/_evaluators/_intent_resolution/intent_resolution.prompty (new file)

@@ -0,0 +1,161 @@
+ ---
+ name: Intent Resolution Evaluator
+ description: Evaluates whether user intent was identified and correctly resolved
+ model:
+   api: chat
+   parameters:
+     temperature: 0.0
+     max_tokens: 800
+     top_p: 1.0
+     presence_penalty: 0
+     frequency_penalty: 0
+     response_format:
+       type: json_object
+
+ inputs:
+   query:
+     type: string
+   response:
+     type: string
+   tool_definitions:
+     type: string
+     optional: true
+     default: "[]"
+ ---
+ system:
+ You are an expert in evaluating the quality of a RESPONSE from an intelligent assistant based on provided definition and Data.
+
+ user:
+ # Goal
+ Your goal is to assess the quality of the RESPONSE of an assistant in relation to a QUERY from a user, specifically focusing on
+ the assistant's ability to understand and resolve the user intent expressed in the QUERY. There is also a field for tool definitions
+ describing the functions, if any, that are accessible to the agent and that the agent may invoke in the RESPONSE if necessary.
+
+ There are two components to intent resolution:
+     - Intent Understanding: The extent to which the agent accurately discerns the user's underlying need or inquiry.
+     - Response Resolution: The degree to which the agent's response is comprehensive, relevant, and adequately addresses the user's request.
+
+ Note that the QUERY can either be a string with a user request or an entire conversation history including previous requests and responses from the assistant.
+ In this case, the assistant's response should be evaluated in the context of the entire conversation but the focus should be on the last intent.
+
+ # Data
+ QUERY: {{query}}
+ RESPONSE: {{response}}
+ TOOL_DEFINITIONS: {{tool_definitions}}
+
+
+ # Ratings
+ ## [Score: 1] (Response completely unrelated to user intent)
+ **Definition:** The agent's response does not address the query at all.
+
+ **Example:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** The latest smartphone models have incredible features and performance.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "The agent's response is entirely off-topic, discussing smartphones instead of providing any information about baking a chocolate cake."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "discussion about smartphone features",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": false,
+   "intent_resolved": false,
+   "resolution_score": 1,
+ }
+
+
+ ## [Score: 2] (Response minimally relates to user intent)
+ **Definition:** The response shows a token attempt to address the query by mentioning a relevant keyword or concept, but it provides almost no useful or actionable information.
+
+ **Example input:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** Chocolate cake involves some ingredients.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "While the response mentions 'ingredients' related to a chocolate cake, it barely addresses the process or any detailed steps, leaving the query unresolved."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "mention of ingredients",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": false,
+   "intent_resolved": false,
+   "resolution_score": 2,
+ }
+
+
+ ## [Score: 3] (Response partially addresses the user intent but lacks complete details)
+ **Definition:** The response provides a basic idea related to the query by mentioning a few relevant elements, but it omits several key details and specifics needed for fully resolving the user's query.
+
+ **Example input:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** Preheat your oven and mix the ingredients before baking the cake.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "The response outlines a minimal process (preheating and mixing) but omits critical details like ingredient measurements, baking time, and temperature specifics, resulting in only a partial resolution of the query."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "basic baking process",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": true,
+   "intent_resolved": false,
+   "resolution_score": 3,
+ }
+
+
+ ## [Score: 4] (Response addresses the user intent with moderate accuracy but has minor inaccuracies or omissions)
+ **Definition:** The response offers a moderately detailed answer that includes several specific elements relevant to the query, yet it still lacks some finer details or complete information.
+
+ **Example input:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** Preheat your oven to 350°F. In a bowl, combine flour, sugar, cocoa, eggs, and milk, mix well, and bake for about 30 minutes.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "The response includes specific steps and ingredients, indicating a clear intent to provide a recipe. However, it lacks detailed measurements and finer instructions, which makes the resolution only moderately complete."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "provide a basic chocolate cake recipe",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": true,
+   "intent_resolved": false,
+   "resolution_score": 4,
+ }
+
+
+ ## [Score: 5] (Response directly addresses the user intent and fully resolves it)
+ **Definition:** The response provides a complete, detailed, and accurate answer that fully resolves the user's query with all necessary information and precision.
+
+ **Example input:**
+   **Query:** How do I bake a chocolate cake?
+   **Response:** Preheat your oven to 350°F (175°C) and grease a 9-inch round cake pan. In a large bowl, sift together 1 ¾ cups all-purpose flour, 1 cup sugar, ¾ cup unsweetened cocoa powder, 1 ½ tsp baking powder, and 1 tsp salt. In another bowl, beat 2 large eggs with 1 cup milk, ½ cup vegetable oil, and 2 tsp vanilla extract. Combine the wet ingredients with the dry ingredients, then gradually mix in 1 cup boiling water until smooth. Pour the batter into the prepared pan and bake for 30-35 minutes or until a toothpick inserted into the center comes out clean. Allow the cake to cool before serving.
+   **Tool Definitions:** []
+
+ **Expected output**
+ {
+   "explanation": "The response delivers a complete and precise recipe with detailed instructions and measurements, fully addressing the user's query about baking a chocolate cake."
+   "conversation_has_intent": true,
+   "agent_perceived_intent": "provide a comprehensive chocolate cake recipe",
+   "actual_user_intent": "bake a chocolate cake",
+   "correct_intent_detected": true,
+   "intent_resolved": true,
+   "resolution_score": 5,
+ }
+
+
+ # Task
+
+ Please provide your evaluation for the assistant RESPONSE in relation to the user QUERY and tool definitions based on the Definitions and examples above.
+ Your output should consist only of a JSON object, as provided in the examples, that has the following keys:
+   - explanation: a string that explains why you think the input Data should get this resolution_score.
+   - conversation_has_intent: true or false
+   - agent_perceived_intent: a string that describes the intent the agent perceived from the user query
+   - actual_user_intent: a string that describes the actual user intent
+   - correct_intent_detected: true or false
+   - intent_resolved: true or false
+   - resolution_score: an integer between 1 and 5 that represents the resolution score
+
+
+ # Output
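To show how the prompty's JSON output becomes the evaluator's result, here is a small sketch that replays the reshaping done in _do_eval above on a hand-written sample (the llm_output values are invented for illustration; the threshold of 3 is the evaluator's default):

    # Illustration of the post-processing in IntentResolutionEvaluator._do_eval; sample values only.
    llm_output = {
        "explanation": "The assistant correctly identified and answered the weather question.",
        "conversation_has_intent": True,
        "agent_perceived_intent": "get today's weather",
        "actual_user_intent": "get today's weather",
        "correct_intent_detected": True,
        "intent_resolved": True,
        "resolution_score": 5,
    }
    threshold = 3
    score = float(llm_output.pop("resolution_score"))
    reason = llm_output.pop("explanation")
    result = {
        "intent_resolution": score,
        "intent_resolution_result": "pass" if score >= threshold else "fail",
        "intent_resolution_threshold": threshold,
        "intent_resolution_reason": reason,
        "additional_details": llm_output,  # remaining keys from the judge's JSON
    }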
azure/ai/evaluation/_evaluators/_meteor/_meteor.py

@@ -8,6 +8,7 @@ from typing_extensions import overload, override
  from azure.ai.evaluation._common.utils import nltk_tokenize, ensure_nltk_data_downloaded
  from azure.ai.evaluation._evaluators._common import EvaluatorBase
+ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING


  class MeteorScoreEvaluator(EvaluatorBase):
@@ -32,6 +33,8 @@ class MeteorScoreEvaluator(EvaluatorBase):
      :type beta: float
      :param gamma: The METEOR score gamma parameter. Default is 0.5.
      :type gamma: float
+     :param threshold: The threshold for the METEOR score evaluator. Default is 0.5.
+     :type threshold: float

      .. admonition:: Example:

@@ -41,18 +44,30 @@ class MeteorScoreEvaluator(EvaluatorBase):
              :language: python
              :dedent: 8
              :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
+
+     .. admonition:: Example with Threshold:
+
+         .. literalinclude:: ../samples/evaluation_samples_threshold.py
+             :start-after: [START threshold_meteor_score_evaluator]
+             :end-before: [END threshold_meteor_score_evaluator]
+             :language: python
+             :dedent: 8
+             :caption: Initialize with threshold and call a MeteorScoreEvaluator.
      """

      id = "azureml://registries/azureml/models/Meteor-Score-Evaluator/versions/3"
      """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

      @override
-     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5):
+     def __init__(self, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5, *, threshold: float = 0.5):
          self._alpha = alpha
          self._beta = beta
          self._gamma = gamma
          ensure_nltk_data_downloaded()
-         super().__init__()
+         self._threshold = threshold
+         self._higher_is_better = True
+         super().__init__(threshold=threshold, _higher_is_better=self._higher_is_better)
+

      @override
      async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
@@ -74,9 +89,17 @@ class MeteorScoreEvaluator(EvaluatorBase):
              beta=self._beta,
              gamma=self._gamma,
          )
-
+         binary_result = False
+         if self._higher_is_better:
+             if score >= self._threshold:
+                 binary_result = True
+         else:
+             if score <= self._threshold:
+                 binary_result = True
          return {
              "meteor_score": score,
+             "meteor_result": EVALUATION_PASS_FAIL_MAPPING[binary_result],
+             "meteor_threshold": self._threshold,
          }

      @overload  # type: ignore
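As with F1 and GLEU, the METEOR evaluator keeps its existing alpha/beta/gamma parameters and adds a keyword-only threshold plus meteor_result and meteor_threshold output keys. A closing usage sketch; inputs are illustrative only:

    from azure.ai.evaluation import MeteorScoreEvaluator

    # alpha/beta/gamma retain their previous defaults; threshold is the new keyword-only parameter.
    meteor = MeteorScoreEvaluator(alpha=0.8, threshold=0.4)
    result = meteor(
        response="Tokyo is the capital of Japan.",
        ground_truth="The capital of Japan is Tokyo.",
    )
    print(result["meteor_score"], result["meteor_result"], result["meteor_threshold"])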