azure-ai-evaluation 1.9.0__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.

Note: this release of azure-ai-evaluation has been flagged as potentially problematic; see the source registry page for details.

Files changed (85)
  1. azure/ai/evaluation/__init__.py +46 -12
  2. azure/ai/evaluation/_aoai/python_grader.py +84 -0
  3. azure/ai/evaluation/_aoai/score_model_grader.py +1 -0
  4. azure/ai/evaluation/_common/onedp/models/_models.py +5 -0
  5. azure/ai/evaluation/_common/rai_service.py +3 -3
  6. azure/ai/evaluation/_common/utils.py +74 -17
  7. azure/ai/evaluation/_converters/_ai_services.py +60 -10
  8. azure/ai/evaluation/_converters/_models.py +75 -26
  9. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +70 -22
  10. azure/ai/evaluation/_evaluate/_eval_run.py +14 -1
  11. azure/ai/evaluation/_evaluate/_evaluate.py +163 -44
  12. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +79 -33
  13. azure/ai/evaluation/_evaluate/_utils.py +5 -2
  14. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  15. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +8 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +3 -2
  17. azure/ai/evaluation/_evaluators/_common/_base_eval.py +143 -25
  18. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +7 -2
  19. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +19 -9
  20. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +15 -5
  21. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +4 -1
  22. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +4 -1
  23. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +5 -2
  24. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +4 -1
  25. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +3 -0
  26. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -0
  27. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +1 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +3 -2
  29. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  30. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +114 -4
  31. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +9 -3
  32. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +1 -1
  33. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +8 -1
  34. azure/ai/evaluation/_evaluators/_qa/_qa.py +1 -1
  35. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +56 -3
  36. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +140 -59
  37. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +11 -3
  38. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +3 -2
  39. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +1 -1
  40. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +2 -1
  41. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +3 -2
  42. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +24 -12
  43. azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty +354 -66
  44. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +214 -187
  45. azure/ai/evaluation/_evaluators/_tool_call_accuracy/tool_call_accuracy.prompty +126 -31
  46. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +8 -1
  47. azure/ai/evaluation/_evaluators/_xpia/xpia.py +4 -1
  48. azure/ai/evaluation/_exceptions.py +1 -0
  49. azure/ai/evaluation/_legacy/_batch_engine/_config.py +6 -3
  50. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +115 -30
  51. azure/ai/evaluation/_legacy/_batch_engine/_result.py +2 -0
  52. azure/ai/evaluation/_legacy/_batch_engine/_run.py +2 -2
  53. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +28 -31
  54. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +2 -0
  55. azure/ai/evaluation/_version.py +1 -1
  56. azure/ai/evaluation/red_team/__init__.py +4 -3
  57. azure/ai/evaluation/red_team/_attack_objective_generator.py +17 -0
  58. azure/ai/evaluation/red_team/_callback_chat_target.py +14 -1
  59. azure/ai/evaluation/red_team/_evaluation_processor.py +376 -0
  60. azure/ai/evaluation/red_team/_mlflow_integration.py +322 -0
  61. azure/ai/evaluation/red_team/_orchestrator_manager.py +661 -0
  62. azure/ai/evaluation/red_team/_red_team.py +655 -2665
  63. azure/ai/evaluation/red_team/_red_team_result.py +6 -0
  64. azure/ai/evaluation/red_team/_result_processor.py +610 -0
  65. azure/ai/evaluation/red_team/_utils/__init__.py +34 -0
  66. azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +11 -4
  67. azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +6 -0
  68. azure/ai/evaluation/red_team/_utils/constants.py +0 -2
  69. azure/ai/evaluation/red_team/_utils/exception_utils.py +345 -0
  70. azure/ai/evaluation/red_team/_utils/file_utils.py +266 -0
  71. azure/ai/evaluation/red_team/_utils/formatting_utils.py +115 -13
  72. azure/ai/evaluation/red_team/_utils/metric_mapping.py +24 -4
  73. azure/ai/evaluation/red_team/_utils/progress_utils.py +252 -0
  74. azure/ai/evaluation/red_team/_utils/retry_utils.py +218 -0
  75. azure/ai/evaluation/red_team/_utils/strategy_utils.py +17 -4
  76. azure/ai/evaluation/simulator/_adversarial_simulator.py +14 -2
  77. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +13 -1
  78. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +21 -7
  79. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +24 -5
  80. azure/ai/evaluation/simulator/_simulator.py +12 -0
  81. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/METADATA +63 -4
  82. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/RECORD +85 -76
  83. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/WHEEL +1 -1
  84. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info/licenses}/NOTICE.txt +0 -0
  85. {azure_ai_evaluation-1.9.0.dist-info → azure_ai_evaluation-1.11.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -71,11 +71,11 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
     _PROMPTY_FILE = "similarity.prompty"
     _RESULT_KEY = "similarity"
 
-    id = "similarity"
+    id = "azureai://built-in/evaluators/similarity"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=3):
+    def __init__(self, model_config, *, threshold=3, credential=None):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self._threshold = threshold
@@ -85,6 +85,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
+            credential=credential,
             _higher_is_better=self._higher_is_better,
         )
 
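The new `credential` keyword appears intended to let the evaluator authenticate with a token credential (for example, Entra ID via `azure.identity`) rather than an API key, and it defaults to `None` so existing callers are unaffected. A minimal usage sketch, with placeholder endpoint and deployment names (check the 1.11.0 reference docs for the exact call shape):

```python
# Hedged usage sketch for the new credential parameter (endpoint/deployment are placeholders).
from azure.ai.evaluation import AzureOpenAIModelConfiguration, SimilarityEvaluator
from azure.identity import DefaultAzureCredential

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<your-deployment>",                        # placeholder
)

# credential defaults to None, so code written against 1.9.0 keeps working unchanged.
similarity = SimilarityEvaluator(model_config=model_config, credential=DefaultAzureCredential())

result = similarity(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    ground_truth="Paris",
)
print(result["similarity"])  # _RESULT_KEY is "similarity", per the hunk above
```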

azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

@@ -3,16 +3,19 @@
 # ---------------------------------------------------------
 import os
 import math
+import logging
 from typing import Dict, Union, List, Optional
 
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
-from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
+from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
 from azure.ai.evaluation._model_configurations import Message
 from azure.ai.evaluation._common._experimental import experimental
 
+logger = logging.getLogger(__name__)
+
 
 @experimental
 class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
@@ -62,15 +65,21 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 
     _DEFAULT_TASK_ADHERENCE_SCORE = 3
 
-    id = None
+    id = "azureai://built-in/evaluators/task_adherence"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
-    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, **kwargs):
+    def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         self.threshold = threshold
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY, **kwargs)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
 
     @overload
     def __call__(
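Based on the constructor signature shown above, instantiation gains the same optional `credential` keyword as the other prompty-based evaluators. A sketch with placeholder config values, assuming TaskAdherenceEvaluator is still exported from the package root as in 1.9.0:

```python
# Hedged construction sketch mirroring the new __init__ signature (placeholders for config values).
from azure.ai.evaluation import AzureOpenAIModelConfiguration, TaskAdherenceEvaluator
from azure.identity import DefaultAzureCredential

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<your-deployment>",                        # placeholder
)

# threshold defaults to _DEFAULT_TASK_ADHERENCE_SCORE (3); credential is the new optional keyword.
task_adherence = TaskAdherenceEvaluator(
    model_config=model_config,
    threshold=3,
    credential=DefaultAzureCredential(),
)
```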
@@ -140,20 +149,23 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
                 category=ErrorCategory.MISSING_FIELD,
                 target=ErrorTarget.TASK_ADHERENCE_EVALUATOR,
             )
-
+        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+        if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
+            eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-
-        score = math.nan
-        if llm_output:
-            score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")
-
+        if isinstance(llm_output, dict):
+            score = float(llm_output.get("score", math.nan))
             score_result = "pass" if score >= self.threshold else "fail"
-
+            reason = llm_output.get("explanation", "")
             return {
                 f"{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
                 f"{self._result_key}_threshold": self.threshold,
                 f"{self._result_key}_reason": reason,
+                # Uncomment the following line in the next iteration after UI contracts are validated.
+                # f"{self._result_key}_additional_details": llm_output
             }
-
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
         return {self._result_key: math.nan}
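With the prompty now returning a JSON object, `_do_eval` reads `score` and `explanation` directly instead of regex-parsing free text, and falls back to NaN when the output is not a dict. A consumption sketch continuing the constructor example above (the `task_adherence` result-key prefix and the message-list input shape are assumptions to verify against the docs):

```python
# Hedged sketch: calling the evaluator and reading the result keys emitted above.
# The "task_adherence" prefix is assumed from the evaluator's result key.
query = [
    {"role": "system", "content": "You are a travel assistant. Always answer concisely."},
    {"role": "user", "content": "Plan a weekend itinerary in Paris."},
]
response = [
    {"role": "assistant", "content": "Day 1: Eiffel Tower and the Louvre. Day 2: Montmartre and a Seine cruise."},
]

row = task_adherence(query=query, response=response)

score = row["task_adherence"]            # float score; NaN if the LLM output was not a dict
verdict = row["task_adherence_result"]   # "pass" or "fail" against the configured threshold
reason = row["task_adherence_reason"]    # the model's "explanation" field
print(f"{verdict} ({score}): {reason}")
```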

azure/ai/evaluation/_evaluators/_task_adherence/task_adherence.prompty

@@ -1,6 +1,6 @@
 ---
 name: TaskAdherence
-description: Evaluates Task Adherence score for QA scenario
+description: Evaluates Task Adherence score
 model:
   api: chat
   parameters:
@@ -10,7 +10,7 @@ model:
     presence_penalty: 0
     frequency_penalty: 0
     response_format:
-      type: text
+      type: json_object
 
 
 inputs:
@@ -21,97 +21,385 @@ inputs:
     type: string
     optional: true
     default: "[]"
-
 ---
 system:
-# Instruction
-## Context
-### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided.
-- **Definition**: Based on the provided query, response, and tool definitions, evaluate the agent's adherence to the assigned task.
-- **Data**: Your input data includes query, response, and tool definitions.
-- **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+You are an expert evaluator who scores how well an AI assistant executed the intended task based on system constraints and user requests.
+
+user:
+ROLE
+====
+You are Task-Adherence-Judge, an impartial evaluator who scores task execution quality.
+
+INPUT
+=====
+CONVERSATION_HISTORY: {{query}}
+AGENT_RESPONSE: {{response}}
+TOOL_DEFINITIONS: {{tool_definitions}}
+
+CONVERSATION_HISTORY includes the full dialogue. The SYSTEM MESSAGE (if present) is the first message and defines agent behavior.
+AGENT_RESPONSE is the agent's reply to the latest user query.
+TOOL_DEFINITIONS lists available tools.
+
+EVALUATION STEPS
+================
+
+A. Extract System Instructions (if system message exists):
+   - Identify any mandatory rules the agent must follow (e.g., “must”, “never”, “only”, or implied constraints).
+   - Note preferences that influence task execution (e.g., “should”, “prefer”, “avoid”).
+   - Record tool usage expectations (required, preferred, or restricted tools).
+
+B. Identify the User Request:
+   - Determine what the user explicitly wants: action, answer, or outcome.
+   - Use full conversation history if needed for clarity.
+
+C. Define the Intended Task:
+   - Combine the user request with mandatory system constraints.
+   - Adjust execution details based on preferences.
+   - This defines the correct task behavior.
+
+D. Evaluate Agent Execution:
+   1. Constraint Adherence:
+      - Were all mandatory rules followed?
+      - Any violation should cap the score at 3.
+
+   2. Task Completion:
+      - Did the agent fulfill the defined task?
+      - Were any core parts of the request missed?
+
+   3. Tool Usage Evaluation:
+      - Was tool use appropriate, as per instructions or task need?
+      - **System mandates tool**: Required (score ≤2 if not used correctly)
+      - **System prefers tool**: Preferred (minor score impact if not used)
+      - **System silent on tools**: Evaluate based on task effectiveness
+      - **System restricts tools**: Discouraged (score ≤2 if used when restricted)
+
+
+E. Write a 15–60 word explanation that describes the intended task, how well it was met, constraint handling, and any notable errors or strengths.
+
+F. Choose the single closest integer score (1,2,3,4 or 5) from the SCORING RUBRIC below.
+
+OUTPUT FORMAT
+=============
+Output a JSON object with the below keys in the given sequence:
+1) explanation: a concise 15–60 word summary of how well the assistant followed the task
+2) agent_perceived_task: what the assistant believed it had to do, based on its response;
+3) actual_task: what the task truly was, considering both system prompt and user latest query, with system taking precedence.
+4) tools_used: List of tools used by the agent
+5) expected_tool_calls: List of tools the agent should have used (if applicable)
+6) score: an integer score from 1 (very poor) to 5 (excellent) on how well the agent resolved the user's intent within constraints
+
+SCORING RUBRIC WITH EXAMPLES
+============================
+
+**Score 5 - Fully Adherent:**
+- Executed intended task completely
+- Followed all mandatory system rules
+- Used tools correctly when required by system or task necessity
+- User's need fully satisfied within constraints
+
+Example A (single-turn)
+=======================
+CONVERSATION_HISTORY:
+User turn 1:
+Suggest a good mystery book.
+
+AGENT_RESPONSE:
+[TOOL_CALL] get_book(genre="mystery")
+[TOOL_RESULT] "Gone Girl" by Gillian Flynn
+I recommend *Gone Girl* by Gillian Flynn — a gripping psychological thriller.
+
+TOOL_DEFINITIONS:
+- get_book: Returns a book recommendation based on genre. (inputs: genre)
+
+EXPECTED OUTPUT:
+{
+  "explanation": "The assistant correctly identified the task, used the appropriate tool as needed, and provided a clear and suitable book recommendation.",
+  "agent_perceived_task": "recommend a mystery book",
+  "actual_task": "Suggest a good mystery book",
+  "tools_used": ["get_book"],
+  "expected_tool_calls": ["get_book"],
+  "score": 5
+}
+
+Example B (multi-turn)
+======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+Do not use tools unless explicitly requested by the user.
+
+User turn 1:
+Can you recommend a good book?
+
+Agent turn 1:
+Any preferred genre?
+
+User turn 2:
+Historical fiction.
+
+AGENT_RESPONSE:
+You might enjoy *The Book Thief* by Markus — a powerful story set during World War II.
+
+TOOL_DEFINITIONS:
+- get_book: Returns book suggestions based on genre. (inputs: genre)
+
+EXPECTED OUTPUT:
+{
+  "explanation": "The assistant correctly followed the system instruction by not using tools since the user never explicitly requested one. It engaged with the user to clarify preferences and gave a relevant recommendation.",
+  "agent_perceived_task": "recommend a historical fiction book without tool use",
+  "actual_task": "Recommend a historical fiction book (no tool allowed unless explicitly requested)",
+  "tools_used": [],
+  "expected_tool_calls": [],
+  "score": 5
+}
+
+**Score 4 - Mostly Adherent:**
+- Executed core intended task
+- Followed mandatory system rules
+- Minor execution gaps that don't prevent task completion
+- Tool usage mostly appropriate
+
+Example A (single-turn)
+=======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+You must respond clearly and concisely using available tools. Minor omissions are acceptable.
+
+User turn 1:
+What’s the weather like in Paris?
+
+AGENT_RESPONSE:
+[TOOL_CALL] get_weather(city="Paris")
+[TOOL_RESULT] 22°C, partly cloudy
+Paris is currently 22°C and partly cloudy.
+
+TOOL_DEFINITIONS:
+- get_weather: Provides current weather details for a city. (inputs: city)
+
+EXPECTED OUTPUT:
+{
+  "explanation": "The assistant correctly used the expected tool and completed the core task. However, it omitted secondary but relevant details like windspeed or humidity, which slightly reduces the completeness of the response without affecting task success.",
+  "agent_perceived_task": "get weather for Paris",
+  "actual_task": "Provide current weather in Paris",
+  "tools_used": ["get_weather"],
+  "expected_tool_calls": ["get_weather"],
+  "score": 4
+}
+
+Example B (multi-turn)
+======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+Always ask for genre before making a book suggestion.
+
+User turn 1:
+Recommend a good book to read.
+
+Agent turn 1:
+Any preferred genre?
+
+User turn 2:
+Something romantic.
+
+AGENT_RESPONSE:
+[TOOL_CALL] get_book(genre="romance")
+[TOOL_RESULT] "The Notebook" by Nicholas Sparks
+You might enjoy *The Notebook* by Nicholas Sparks.
+
+TOOL_DEFINITIONS:
+- get_book: Returns book suggestions based on genre. (inputs: genre)
+
+EXPECTED OUTPUT:
+{
+  "explanation": "Assistant followed system rule to ask for genre, used the correct tool, and offered a relevant result, though without justification.",
+  "agent_perceived_task": "recommend a romance novel",
+  "actual_task": "Recommend a romance book (after genre clarification as per system)",
+  "tools_used": ["get_book"],
+  "expected_tool_calls": ["get_book"],
+  "score": 4
+}
+
+**Score 3 - Partially Adherent:**
+- Attempted intended task but missed key components
+- Some system rule violations OR incomplete task execution
+- Tool usage suboptimal or missing when preferred by system
+- User's need partially satisfied
+
+Example A (single-turn)
+=======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+Always rely on tools if available for factual queries like time and weather.
+
+User turn 1:
+What’s the weather in Sydney?
+
+AGENT_RESPONSE:
+It's probably sunny this time of year.
+
+TOOL_DEFINITIONS:
+- get_book: Returns book suggestions based on genre. (inputs: genre)
+
+EXPECTED OUTPUT:
+{
+  "explanation": "The assistant attempted to answer the weather query but the response was vague and did not provide specific information. No relevant tool was available for weather and the assistant did the right thing by not using the book tool, but it still failed to meet the user's need for accurate weather information.",
+  "agent_perceived_task": "estimate Sydney's weather",
+  "actual_task": "Get real-time weather in Sydney (tool use required per system)",
+  "tools_used": [],
+  "expected_tool_calls": [],
+  "score": 3
+}
+
+Example B (multi-turn)
+======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+If tools are available, prefer them for recommendations.
+
+User turn 1:
+Can you suggest a book?
+
+Agent turn 1:
+Sure, any specific genre?
+
+User turn 2:
+Thriller.
+
+AGENT_RESPONSE:
+You might enjoy *The Da Vinci Code*.
 
-# Definition
+TOOL_DEFINITIONS:
+- get_book: Get book suggestions by genre. (inputs: genre)
 
-**Level 1: Fully Inadherent**
+EXPECTED OUTPUT:
+{
+  "explanation": "Assistant gave a relevant suggestion but skipped the tool, despite system prompt recommending its use for consistency.",
+  "agent_perceived_task": "recommend thriller book",
+  "actual_task": "Suggest a thriller book (prefer tool per system)",
+  "tools_used": [],
+  "expected_tool_calls": ["get_book"],
+  "score": 3
+}
 
-**Definition:**
-Response completely ignores instructions or deviates significantly
+**Score 2 - Barely Adherent:**
+- Minimal task execution
+- Violated mandatory system rules OR failed core task requirements
+- Wrong tool usage or missing when required by system
+- User's need largely unmet
 
-**Example:**
-**Query:** What is a recommended weekend itinerary in Paris?
-**Response:** Paris is a lovely city with a rich history.
+Example A (single-turn)
+=======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+You are a tool-based agent. Always respond using tools.
 
-Explanation: This response completely misses the task by not providing any itinerary details. It offers a generic statement about Paris rather than a structured travel plan.
+User turn 1:
+What's the time in Madrid?
 
+AGENT_RESPONSE:
+Europe has different time zones.
 
-**Level 2: Barely Adherent**
+TOOL_DEFINITIONS:
+- get_time: Return current time for a city. (inputs: city)
 
-**Definition:**
-Response partially aligns with instructions but has critical gaps.
+EXPECTED OUTPUT:
+{
+  "explanation": "Assistant responded vaguely and violated the system rule requiring tool usage. Failed to directly answer the task.",
+  "agent_perceived_task": "talk about time zones",
+  "actual_task": "Tell the current time in Madrid (using tool as required by system)",
+  "tools_used": [],
+  "expected_tool_calls": ["get_time"],
+  "score": 2
+}
 
-**Example:**
-**Query:** What is a recommended weekend itinerary in Paris?
-**Response:** Spend your weekend visiting famous places in Paris.
+Example B (multi-turn)
+======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+You are a book recommender assistant. Do **not** ask follow-up questions. Only respond with direct suggestions using tools.
 
-Explanation: While the response hints at visiting well-known sites, it is extremely vague and lacks specific details, such as which sites to visit or any order of activities, leaving major gaps in the instructions.
+User turn 1:
+Recommend a good fantasy book.
 
+Agent turn 1:
+Do you prefer magic or adventure?
 
-**Level 3: Moderately Adherent**
+User turn 2:
+Magic.
 
-**Definition:**
-Response meets the core requirements but lacks precision or clarity.
+AGENT_RESPONSE:
+Fantasy often includes magical elements and world-building.
 
-**Example:**
-**Query:** What is a recommended weekend itinerary in Paris?
-**Response:** Visit the Eiffel Tower and the Louvre on Saturday, and stroll through Montmartre on Sunday.
+TOOL_DEFINITIONS:
+- get_book: Retrieve books filtered by genre. (inputs: genre)
 
-Explanation: This answer meets the basic requirement by naming a few key attractions and assigning them to specific days. However, it lacks additional context, such as timings, additional activities, or details to make the itinerary practical and clear.
+EXPECTED OUTPUT:
+{
+  "explanation": "The assistant violated the system's instruction not to ask clarifying questions and failed to use the required tool. Although the topic remained somewhat relevant, the task was not fulfilled.",
+  "agent_perceived_task": "discuss fantasy genre",
+  "actual_task": "Recommend a fantasy book using tool without asking follow-ups (per system)",
+  "tools_used": [],
+  "expected_tool_calls": ["get_book"],
+  "score": 2
+}
 
+**Score 1 - Non-Adherent:**
+- Failed to execute intended task
+- Ignored mandatory system constraints
+- Completely inappropriate or missing tool usage
+- Response irrelevant to user's need
 
-**Level 4: Mostly Adherent**
+Example A (single-turn)
+=======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+Only answer factual questions using tools. Avoid personal commentary.
 
-**Definition:**
-Response is clear, accurate, and aligns with instructions with minor issues.
+User turn 1:
+What's the current time in Berlin?
 
-**Example:**
-**Query:** What is a recommended weekend itinerary in Paris?
-**Response:** For a weekend in Paris, start Saturday with a morning visit to the Eiffel Tower, then head to the Louvre in the early afternoon. In the evening, enjoy a leisurely walk along the Seine. On Sunday, begin with a visit to Notre-Dame Cathedral, followed by exploring the art and cafés in Montmartre. This plan offers a mix of cultural visits and relaxing experiences.
+AGENT_RESPONSE:
+I prefer mornings to evenings. How about you?
 
-Explanation: This response is clear, structured, and provides a concrete itinerary with specific attractions and a suggested order of activities. It is accurate and useful, though it might benefit from a few more details like exact timings or restaurant suggestions to be perfect.
+TOOL_DEFINITIONS:
+- get_time: Return time for a given city. (inputs: city)
 
+EXPECTED OUTPUT:
+{
+  "explanation": "Assistant ignored both the user's factual query and the system's constraint to avoid personal remarks. No tool used.",
+  "agent_perceived_task": "chat about preferences",
+  "actual_task": "Tell the current time in Berlin (using tool, avoid chit-chat as per system)",
+  "tools_used": [],
+  "expected_tool_calls": ["get_time"],
+  "score": 1
+}
 
-**Level 5: Fully Adherent**
+Example B (multi-turn)
+======================
+CONVERSATION_HISTORY:
+SYSTEM_PROMPT:
+Only provide weather updates using tools. Do not include personal opinions.
 
-**Definition:**
-Response is flawless, accurate, and follows instructions to the letter.
+User turn 1:
+I need a weather update for Rome.
 
-**Example:**
-**Query:** What is a recommended weekend itinerary in Paris?
-**Response:** Here is a detailed weekend itinerary in Paris:
-Saturday:
-Morning: Begin your day with a visit to the Eiffel Tower to admire the views from the top.
-Early Afternoon: Head to the Louvre for a guided tour of its most famous exhibits.
-Late Afternoon: Take a relaxing walk along the Seine, stopping at local boutiques.
-Evening: Enjoy dinner at a classic Parisian bistro near the river.
-Sunday:
-Morning: Visit the Notre-Dame Cathedral to explore its architecture and history.
-Midday: Wander the charming streets of Montmartre, stopping by art galleries and cafés.
-Afternoon: Finish your trip with a scenic boat tour on the Seine.
-This itinerary balances cultural immersion, leisure, and local dining experiences, ensuring a well-rounded visit.
+Agent turn 1:
+Celsius or Fahrenheit?
 
-Explanation: This response is comprehensive and meticulously follows the instructions. It provides detailed steps, timings, and a variety of activities that fully address the query, leaving no critical gaps.
+User turn 2:
+Celsius.
 
-# Data
-Query: {{query}}
-Response: {{response}}
-Tool Definitions: {{tool_definitions}}
+AGENT_RESPONSE:
+I once visited Rome in summer. Beautiful city!
 
-# Tasks
-## Please provide your assessment Score for the previous answer. Your output should include the following information:
-- **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:".
-- **Explanation**: a very short explanation of why you think the input data should get that Score.
-- **Score**: based on your previous analysis, provide your Score. The answer you give MUST be an integer score ("1", "2", ...) based on the categories of the definitions.
+TOOL_DEFINITIONS:
+- get_weather: Provides weather data for a given city. (inputs: city, unit)
 
-## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
-# Output
+EXPECTED OUTPUT:
+{
+  "explanation": "Assistant acknowledged the input but failed to execute the task or use the required tool, and added personal opinion against system guidance.",
+  "agent_perceived_task": "share travel experience",
+  "actual_task": "Provide current weather in Celsius for Rome (using tool, no personal commentary)",
+  "tools_used": [],
+  "expected_tool_calls": ["get_weather"],
+  "score": 1
+}
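
Taken together, the prompty rewrite and the dict-based parsing change only the evaluator's internals; batch evaluation should remain the usual `evaluate()` call. A closing sketch (the JSONL file and its column names are hypothetical):

```python
# Hedged end-to-end sketch: running the updated evaluator over a dataset with evaluate().
# "agent_runs.jsonl" and its columns ("query", "response", "tool_definitions") are made-up placeholders.
from azure.ai.evaluation import evaluate

results = evaluate(
    data="agent_runs.jsonl",
    evaluators={"task_adherence": task_adherence},  # instance from the construction sketch above
    output_path="task_adherence_results.json",
)
print(results["metrics"])  # aggregated metrics; per-row results are under results["rows"]
```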