azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py (new file)

@@ -0,0 +1,177 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import os
+import math
+import logging
+from typing import Dict, Union, List, Optional
+
+from typing_extensions import overload, override
+
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from ..._common.utils import reformat_conversation_history, reformat_agent_response, reformat_tool_definitions
+from azure.ai.evaluation._model_configurations import Message
+from azure.ai.evaluation._common._experimental import experimental
+
+logger = logging.getLogger(__name__)
+
+
+@experimental
+class _TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Task Completion evaluator determines whether an AI agent successfully completed the requested task based on:
+
+    - Final outcome and deliverable of the task
+    - Completeness of task requirements
+
+    This evaluator focuses solely on task completion and success, not on task adherence or intent understanding.
+
+    Scoring is binary:
+    - 1 (pass): Task fully completed with usable deliverable that meets all user requirements
+    - 0 (fail): Task incomplete, partially completed, or deliverable does not meet requirements
+
+    The evaluation includes task requirement analysis, outcome assessment, and completion gap identification.
+
+
+    :param model_config: Configuration for the Azure OpenAI model.
+    :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
+        ~azure.ai.evaluation.OpenAIModelConfiguration]
+
+    .. admonition:: Example:
+        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
+            :start-after: [START task_completion_evaluator]
+            :end-before: [END task_completion_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _TaskCompletionEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START task_completion_evaluator]
+            :end-before: [END task_completion_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call a _TaskCompletionEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
+    """
+
+    _PROMPTY_FILE = "task_completion.prompty"
+    _RESULT_KEY = "task_completion"
+    _OPTIONAL_PARAMS = ["tool_definitions"]
+
+    id = "azureai://built-in/evaluators/task_completion"
+    """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
+
+    @override
+    def __init__(self, model_config, *, credential=None, **kwargs):
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
+        super().__init__(
+            model_config=model_config,
+            prompty_file=prompty_path,
+            result_key=self._RESULT_KEY,
+            credential=credential,
+            **kwargs,
+        )
+
+    @overload
+    def __call__(
+        self,
+        *,
+        query: Union[str, List[dict]],
+        response: Union[str, List[dict]],
+        tool_definitions: Optional[Union[dict, List[dict]]] = None,
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate task completion for a given query, response, and optionally tool definitions.
+        The query and response can be either a string or a list of messages.
+
+
+        Example with string inputs and no tools:
+            evaluator = _TaskCompletionEvaluator(model_config)
+            query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
+            response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."
+
+            result = evaluator(query=query, response=response)
+
+        Example with list of messages:
+            evaluator = _TaskCompletionEvaluator(model_config)
+            query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
+            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
+            tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]
+
+            result = evaluator(query=query, response=response, tool_definitions=tool_definitions)
+
+        :keyword query: The query being evaluated, either a string or a list of messages.
+        :paramtype query: Union[str, List[dict]]
+        :keyword response: The response being evaluated, either a string or a list of messages (full agent response potentially including tool calls)
+        :paramtype response: Union[str, List[dict]]
+        :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
+        :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
+        :return: A dictionary with the task completion evaluation results.
+        :rtype: Dict[str, Union[str, float]]
+        """
+
+    @override
+    def __call__(  # pylint: disable=docstring-missing-param
+        self,
+        *args,
+        **kwargs,
+    ):
+        """
+        Invokes the instance using the overloaded __call__ signature.
+
+        For detailed parameter types and return value documentation, see the overloaded __call__ definition.
+        """
+        return super().__call__(*args, **kwargs)
+
+    @override
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do Task Completion evaluation.
+        :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
+        :type eval_input: Dict
+        :return: The evaluation result.
+        :rtype: Dict
+        """
+        # we override the _do_eval method as we want the output to be a dictionary,
+        # which is a different schema than _base_prompty_eval.py
+        if "query" not in eval_input and "response" not in eval_input:
+            raise EvaluationException(
+                message=f"Both query and response must be provided as input to the Task Completion evaluator.",
+                internal_message=f"Both query and response must be provided as input to the Task Completion evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
+            )
+        eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
+        eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
+        if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
+            eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", {})
+
+        if isinstance(llm_output, dict):
+            success = llm_output.get("success", 0)
+            if isinstance(success, str):
+                success = 1 if success.upper() == "TRUE" else 0
+
+            success_result = "pass" if success == 1 else "fail"
+            reason = llm_output.get("explanation", "")
+            return {
+                f"{self._result_key}": success,
+                f"{self._result_key}_result": success_result,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_details": llm_output.get("details", ""),
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
+            }
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning 0 for the success.")
+        return {self._result_key: 0}
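For orientation, here is a minimal usage sketch of the new evaluator, based on the docstring above. It assumes the `_task_completion` package `__init__.py` (listed in the file list, not shown in this excerpt) re-exports `_TaskCompletionEvaluator`; the endpoint, key, and deployment values are placeholders, and the class is private and experimental, so the import path may change.

```python
# Minimal usage sketch (not from the package): placeholder endpoint/key/deployment values,
# and the private import path is inferred from the file layout shown in this diff.
from azure.ai.evaluation import AzureOpenAIModelConfiguration
from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    api_key="<your-api-key>",  # placeholder
    azure_deployment="<your-deployment>",  # placeholder
)

evaluator = _TaskCompletionEvaluator(model_config)
result = evaluator(
    query="Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.",
    response="**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais...",
)

# Per _do_eval above, the result includes task_completion (1 or 0),
# task_completion_result ("pass"/"fail"), task_completion_reason, and token/model metadata.
print(result["task_completion"], result["task_completion_result"])
```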
azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty (new file)

@@ -0,0 +1,220 @@
+---
+name: Task Completion
+description: Evaluates whether a task was successfully completed
+model:
+  api: chat
+  parameters:
+    temperature: 0.0
+    max_tokens: 1500
+    top_p: 1.0
+    presence_penalty: 0
+    frequency_penalty: 0
+    response_format:
+      type: json_object
+inputs:
+  query:
+    type: string
+  response:
+    type: string
+  tool_definitions:
+    type: Dict
+    optional: true
+    default: {}
+---
+system:
+You are an expert evaluator who determines if an agent has successfully completed the task required by the user based on the final outcome.
+
+user:
+ROLE
+====
+You are a judge on Task Completion who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**
+
+You are NOT evaluating:
+- How well the agent followed instructions
+- How well the agent understood the user's intent
+
+You ARE evaluating:
+- Whether the task is actually completed in the final outcome
+- Whether the deliverable meets the user's requirements
+- Whether the end result is actionable and usable
+
+INPUT
+=====
+CONVERSATION_HISTORY: {{query}}
+AGENT_RESPONSE: {{response}}
+
+CONVERSATION_HISTORY includes the full dialogue. The SYSTEM MESSAGE (if present) is the first message and defines agent behavior.
+AGENT_RESPONSE is the agent's reply to the latest user query.
+Tool calls and tool results are not visible to the user. The user only sees the agent's final response.
+
+EVALUATION FRAMEWORK
+====================
+
+A. Identify the Task Requirements:
+- What specific outcome did the user request?
+- What deliverables were expected?
+- What criteria define "completion" for this task?
+
+B. Analyze the Final Outcome:
+- What did the agent actually deliver?
+- Is the deliverable complete and usable?
+- Does it meet the user's specified requirements?
+
+C. Assess Task Completion:
+- **Fully Complete**: All requirements met, deliverable is ready for use
+- **Partially Complete**: Some requirements met, but missing key components
+- **Incomplete**: No usable deliverable or major requirements unmet
+
+D. Assign a Score:
+- **TRUE**: The agent delivered a complete and correct solution that accomplishes the user's entire goal. The user does not need to take further action or ask follow-up questions to get what they originally asked for.
+- **FALSE**: The agent failed to complete one or more parts of the task, provided an incorrect/incomplete result, or left the user's goal unresolved.
+
+OUTPUT FORMAT
+=============
+Output a JSON object with these keys:
+{
+  "explanation": "<15-60 words explaining the completion status>",
+  "details": {
+    "task_requirements": "<15-60 words on what the user specifically requested>",
+    "delivered_outcome": "<15-60 words on what the agent actually provided>",
+    "completion_gaps": "<15-60 words on missing elements if task is incomplete>"
+  },
+  "success": <TRUE or FALSE>
+}
+
+SCORING EXAMPLES
+================
+
+### SUCCESS: TRUE - Example A
+
+CONVERSATION_HISTORY:
+User: Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.
+
+AGENT_RESPONSE:
+**Day 1:**
+- Morning: Visit Louvre Museum (9 AM - 12 PM)
+- Lunch: Le Comptoir du Relais (traditional French bistro)
+- Afternoon: Walk through Tuileries Garden
+- Dinner: L'As du Fallafel (famous in Marais district)
+
+**Day 2:**
+- Morning: Tour Eiffel Tower and Trocadéro
+- Lunch: Du Pain et des Idées (artisanal bakery)
+- Afternoon: Explore Montmartre and Sacré-Cœur
+- Dinner: Le Procope (historic restaurant)
+
+**Day 3:**
+- Morning: Visit Notre-Dame Cathedral area
+- Lunch: Marché des Enfants Rouges (food market)
+- Afternoon: Musée d'Orsay
+- Dinner: Guy Savoy (Michelin-starred French cuisine)
+
+Transportation: Metro Day Pass recommended. Book Louvre tickets online in advance.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "Complete 3-day Paris itinerary delivered with cultural landmarks and local cuisine for each day, including practical transportation advice.",
+  "details": {
+    "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine",
+    "delivered_outcome": "Detailed 3-day schedule with specific landmarks, restaurants, and practical tips",
+    "completion_gaps": "None"
+  },
+  "success": TRUE
+}
+
+### SUCCESS: TRUE - Example B
+
+CONVERSATION_HISTORY:
+User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
+
+AGENT_RESPONSE:
+[TOOL_CALL] run_network_diagnostic()
+[TOOL_RESULT] Issue found: Router firmware outdated, DNS servers slow
+
+[TOOL_CALL] upgrade_plan(customer_id="12345", plan="unlimited")
+[TOOL_RESULT] Plan upgraded successfully. Unlimited data active immediately.
+
+Your internet issues are resolved - update firmware and change DNS as instructed. Your unlimited data plan is now active. You should see stable connection and no data limits immediately.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "Both issues resolved: specific troubleshooting steps provided for internet drops, and unlimited data plan successfully activated.",
+  "details": {
+    "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
+    "delivered_outcome": "Diagnostic completed with specific fix instructions, data plan upgrade confirmed active",
+    "completion_gaps": "None"
+  },
+  "success": TRUE
+}
+
+### SUCCESS: FALSE - Example A
+
+CONVERSATION_HISTORY:
+User: Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine.
+
+AGENT_RESPONSE:
+I'd recommend visiting the Louvre and Eiffel Tower. Paris has great food - try French pastries and wine. The Metro is convenient for getting around.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "Vague suggestions provided but no actual day-by-day itinerary created. Missing specific landmarks, restaurants, and detailed schedule.",
+  "details": {
+    "task_requirements": "3-day Paris itinerary with cultural landmarks and local cuisine",
+    "delivered_outcome": "General recommendations without structured itinerary",
+    "completion_gaps": "No day-by-day schedule, no specific restaurant recommendations, no detailed cultural landmarks"
+  },
+  "success": FALSE
+}
+
+### SUCCESS: FALSE - Example B
+
+CONVERSATION_HISTORY:
+User: My internet connection keeps dropping. Help me troubleshoot and then upgrade my data plan to unlimited.
+
+AGENT_RESPONSE:
+[TOOL_CALL] run_network_diagnostic()
+[TOOL_RESULT] Issue found: Router firmware outdated
+I found that your router firmware needs updating. You can usually do this through your router's admin panel. Let me know if you need help with the data plan upgrade after you fix the connection.
+
+EXPECTED OUTPUT:
+{
+  "explanation": "Internet troubleshooting started but not completed with specific steps. Data plan upgrade not attempted. User cannot proceed without more help.",
+  "details": {
+    "task_requirements": "Fix internet connection issues and upgrade to unlimited data plan",
+    "delivered_outcome": "Identified router firmware issue but no specific fix instructions, data upgrade not completed",
+    "completion_gaps": "No specific firmware update steps, data plan upgrade not processed"
+  },
+  "success": FALSE
+}
+
+### SUCCESS: FALSE - Example C
+
+CONVERSATION_HISTORY:
+User: Analyze our Q3 sales data and generate a summary report.
+
+AGENT_RESPONSE:
+I can help you analyze sales data! Sales analysis is important for business growth. You should look at metrics like revenue, customer acquisition, and seasonal trends. Would you like me to explain different types of sales analysis techniques?
+
+EXPECTED OUTPUT:
+{
+  "explanation": "No actual analysis performed and no report generated. Agent provided general information about sales analysis instead of completing the requested task.",
+  "details": {
+    "task_requirements": "Analyze Q3 sales data and generate summary report",
+    "delivered_outcome": "General information about sales analysis concepts",
+    "completion_gaps": "No data analysis performed, no summary report created"
+  },
+  "success": FALSE
+}
+
+KEY PRINCIPLES
+==============
+
+1. **Outcome Focus**: Judge only the final deliverable, not the process
+2. **User Readiness**: Can the user proceed with what was delivered?
+3. **Requirement Matching**: Does the outcome match what was specifically requested?
+4. **Completeness**: Are all components of the task addressed?
+5. **Actionability**: Is the deliverable usable in its current form?
+
+Remember: A task can be understood correctly and approached properly but still fail if the final outcome doesn't meet requirements.
+
+# Output
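The prompty's JSON contract above ("explanation", "details", "success": TRUE/FALSE) is flattened into per-key results by `_do_eval` in `_task_completion.py`. The snippet below is a standalone sketch of that mapping for illustration; the helper name `map_grader_output` is not part of the package.

```python
# Standalone sketch mirroring how _do_eval flattens the grader's JSON output;
# map_grader_output is an illustrative name and does not exist in the package.
from typing import Any, Dict, Union


def map_grader_output(llm_output: Dict[str, Any], result_key: str = "task_completion") -> Dict[str, Union[str, float]]:
    success = llm_output.get("success", 0)
    if isinstance(success, str):
        # The prompty emits TRUE/FALSE; anything other than TRUE counts as a failure.
        success = 1 if success.upper() == "TRUE" else 0
    return {
        result_key: success,
        f"{result_key}_result": "pass" if success == 1 else "fail",
        f"{result_key}_reason": llm_output.get("explanation", ""),
        f"{result_key}_details": llm_output.get("details", ""),
    }


sample = {"explanation": "All requirements met.", "details": {"completion_gaps": "None"}, "success": "TRUE"}
print(map_grader_output(sample))
# {'task_completion': 1, 'task_completion_result': 'pass', 'task_completion_reason': 'All requirements met.', ...}
```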
azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py (new file)

@@ -0,0 +1,7 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from ._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator, _TaskNavigationEfficiencyMatchingMode
+
+__all__ = ["_TaskNavigationEfficiencyEvaluator", "_TaskNavigationEfficiencyMatchingMode"]
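This `__init__.py` only re-exports two names; the implementation lives in `_task_navigation_efficiency.py` (+384 lines in the file list above), which this excerpt does not show. A minimal import sketch, making no assumptions about constructor parameters or matching-mode values:

```python
# Import sketch only; constructor arguments and matching-mode values are defined in
# _task_navigation_efficiency.py, which is not shown in this excerpt.
from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
    _TaskNavigationEfficiencyEvaluator,
    _TaskNavigationEfficiencyMatchingMode,
)

print(_TaskNavigationEfficiencyEvaluator.__name__, _TaskNavigationEfficiencyMatchingMode.__name__)
```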