azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0

azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py

@@ -69,7 +69,9 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            threshold=threshold,
             credential=credential,
+            _higher_is_better=True,
             **kwargs,
         )

@@ -145,8 +147,10 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
             eval_input["response"] = reformat_agent_response(eval_input["response"], logger)

-
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict["llm_output"]
         # llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+        score = math.nan
         if isinstance(llm_output, dict):
             score = llm_output.get("score", math.nan)
             if not check_score_is_valid(
@@ -162,16 +166,31 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
                 )
             reason = llm_output.get("explanation", "")
             score = float(score)
-            score_result = "pass" if score >= self.
+            score_result = "pass" if score >= self._threshold else "fail"

             response_dict = {
                 f"{self._result_key}": score,
+                f"gpt_{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": self.
+                f"{self._result_key}_threshold": self._threshold,
                 f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
             return response_dict
         # If llm_output is not a dictionary, return NaN for the score. This should never happen
         if logger:
             logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
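
The change above, and the matching hunks in the relevance, response completeness, task adherence, and task completion evaluators below, all follow one pattern: self._flow(...) now returns a dictionary that wraps the model's parsed JSON under "llm_output" together with token counts and other call diagnostics, and each evaluator copies those diagnostics into its result under "{result_key}_"-prefixed keys. A minimal illustrative sketch of that mapping follows; the result key and the sample values are assumed for illustration, not taken from the package.

import math

# Hypothetical prompty output; the key names mirror the diff, the values are invented.
prompty_output_dict = {
    "llm_output": {"score": 4, "explanation": "The response resolves the stated intent."},
    "input_token_count": 512,
    "output_token_count": 48,
    "total_token_count": 560,
    "finish_reason": "stop",
    "model_id": "example-model",
}

result_key = "intent_resolution"  # assumed result key for this evaluator
score = float(prompty_output_dict["llm_output"].get("score", math.nan))
row = {
    result_key: score,
    f"gpt_{result_key}": score,
    f"{result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
    f"{result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
    f"{result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
    f"{result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
    f"{result_key}_model": prompty_output_dict.get("model_id", ""),
}
print(row)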

azure/ai/evaluation/_evaluators/_relevance/_relevance.py

@@ -87,15 +87,13 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
     def __init__(self, model_config, *, credential=None, threshold=3, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self._threshold = threshold
-        self._higher_is_better = True
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
             threshold=threshold,
             credential=credential,
-            _higher_is_better=
+            _higher_is_better=True,
             **kwargs,
         )

@@ -177,7 +175,8 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             eval_input["query"] = reformat_conversation_history(eval_input["query"], logger)
         if not isinstance(eval_input["response"], str):
             eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
-
+        result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = result.get("llm_output")
         score = math.nan

         if isinstance(llm_output, dict):
@@ -188,15 +187,24 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             return {
                 self._result_key: float(score),
                 f"gpt_{self._result_key}": float(score),
-                f"{self._result_key}_reason": reason,
                 f"{self._result_key}_result": binary_result,
                 f"{self._result_key}_threshold": self._threshold,
+                f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                f"{self._result_key}_model": result.get("model_id", ""),
+                f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                f"{self._result_key}_sample_output": result.get("sample_output", ""),
             }

+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
         binary_result = self._get_binary_result(score)
         return {
             self._result_key: float(score),
-            f"gpt_{self._result_key}": float(score),
             f"{self._result_key}_result": binary_result,
             f"{self._result_key}_threshold": self._threshold,
         }

azure/ai/evaluation/_evaluators/_relevance/relevance.prompty

@@ -20,22 +20,25 @@ inputs:
 ---

 system:
-You are a Relevance-Judge, an impartial evaluator that scores how well the RESPONSE addresses the
+You are a Relevance-Judge, an impartial evaluator that scores how well the RESPONSE addresses the user's queries in the CONVERSATION_HISTORY using the definitions provided.

 user:
 ROLE
 ====
-You are a Relevance Evaluator. Your task is to judge how relevant a RESPONSE is to
+You are a Relevance Evaluator. Your task is to judge how relevant a RESPONSE is to the CONVERSATION_HISTORY using the Relevance definitions provided.

 INPUT
 =====
-
+CONVERSATION_HISTORY: {{query}}
 RESPONSE: {{response}}

+CONVERSATION_HISTORY is the full dialogue between the user and the agent up to the user's latest message. For single-turn interactions, this will be just the user's query.
+RESPONSE is the agent's reply to the user's latest message.
+
 TASK
 ====
 Output a JSON object with:
-1) a concise explanation of 15-60 words justifying your score based on how well the response is relevant to the
+1) a concise explanation of 15-60 words justifying your score based on how well the response is relevant to the user's queries in the CONVERSATION_HISTORY.
 2) an integer score from 1 (very poor) to 5 (excellent) using the rubric below.

 The explanation should always precede the score and should clearly justify the score based on the rubric definitions.
@@ -49,13 +52,14 @@ Response format exactly as follows:

 EVALUATION STEPS
 ================
-A. Read the
-B.
-
+A. Read the CONVERSATION_HISTORY and RESPONSE carefully.
+B. Identify the user's query from the latest message (use conversation history for context if needed).
+C. Compare the RESPONSE against the rubric below:
+   - Does the response directly address the user's query?
   - Is the information complete, partial, or off-topic?
   - Is it vague, generic, or insightful?
-
-
+D. Match the response to the best score from the rubric.
+E. Provide a short explanation and the score using the required format.

 SCORING RUBRIC
 ==============
@@ -64,7 +68,7 @@ SCORING RUBRIC
 Definition: The response is unrelated to the question. It provides off-topic information and does not attempt to address the question posed.

 **Example A**
-
+CONVERSATION_HISTORY: What is the team preparing for?
 RESPONSE: I went grocery shopping yesterday evening.

 Expected Output:
@@ -75,7 +79,7 @@ Expected Output:


 **Example B**
-
+CONVERSATION_HISTORY: When will the company's new product line launch?
 RESPONSE: International travel can be very rewarding and educational.

 Expected Output:
@@ -89,7 +93,7 @@ Expected Output:
 Definition: The response is loosely or formally related to the query but fails to deliver any meaningful, specific, or useful information. This includes vague phrases, non-answers, or failure/error messages.

 **Example A**
-
+CONVERSATION_HISTORY: What is the event about?
 RESPONSE: It’s something important.

 Expected Output:
@@ -99,7 +103,7 @@ Expected Output:
 }

 **Example B**
-
+CONVERSATION_HISTORY: What’s the weather in Paris?
 RESPONSE: I tried to find the forecast but the query failed.

 Expected Output:
@@ -112,7 +116,7 @@ Expected Output:
 Definition: The response addresses the query and includes relevant information, but omits essential components or detail. The answer is on-topic but insufficient to fully satisfy the request.

 **Example A**
-
+CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
 RESPONSE: The apartment complex has a gym.

 Expected Output:
@@ -122,7 +126,7 @@ Expected Output:
 }

 **Example B**
-
+CONVERSATION_HISTORY: What services does the premium membership include?
 RESPONSE: It includes priority customer support.

 Expected Output:
@@ -137,7 +141,7 @@ Expected Output:
 Definition: The response fully addresses the question with accurate and sufficient information, covering all essential aspects. Very minor omissions are acceptable as long as the core information is intact and the intent is clearly conveyed.

 **Example A**
-
+CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
 RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security.

 Expected Output:
@@ -147,7 +151,7 @@ Expected Output:
 }

 **Example B**
-
+CONVERSATION_HISTORY: What services does the premium membership include?
 RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases.

 Expected Output:
@@ -161,7 +165,7 @@ Expected Output:
 Definition: The response not only fully and accurately answers the question, but also adds meaningful elaboration, interpretation, or context that enhances the user's understanding. This goes beyond just listing relevant details — it offers insight into why the information matters, how it's useful, or what impact it has.

 **Example A**
-
+CONVERSATION_HISTORY: What amenities does the new apartment complex provide?
 RESPONSE: The apartment complex provides a gym, swimming pool, and 24/7 security, designed to offer residents a comfortable and active lifestyle while ensuring their safety.

 Expected Output:
@@ -171,7 +175,7 @@ Expected Output:
 }

 **Example B**
-
+CONVERSATION_HISTORY: What services does the premium membership include?
 RESPONSE: The premium membership includes priority customer support, exclusive content access, and early product releases — tailored for users who want quicker resolutions and first access to new features.

 Expected Output:
@@ -179,3 +183,16 @@ Expected Output:
   "explanation": "The response covers all essential services and adds valuable insight about the target user and benefits, enriching the response beyond basic listing.",
   "score": 5
 }
+
+### Multi-turn Conversation Example
+When evaluating responses in a multi-turn conversation, consider the conversation context to understand the user's intent:
+
+**Example - Multi-turn Context**
+CONVERSATION_HISTORY: [{"role":"user","content":"I'm planning a vacation to Europe."},{"role":"assistant","content":"That sounds exciting! What time of year are you thinking of traveling?"},{"role":"user","content":"Probably in July. What's the weather like then?"}]
+RESPONSE: [{"role":"assistant","content":"July is summer in Europe with generally warm and pleasant weather. Most countries have temperatures between 20-25°C (68-77°F). It's a popular travel time, so expect crowds at major tourist attractions and higher accommodation prices."}]
+
+Expected Output:
+{
+  "explanation": "The response directly addresses the weather question while providing valuable context about crowds and pricing that's relevant to vacation planning established in the conversation.",
+  "score": 5
+}

azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py

@@ -3,6 +3,7 @@
 # ---------------------------------------------------------

 import os
+import logging
 import math
 from typing import Dict, List, Union, Optional

@@ -14,39 +15,37 @@ from azure.ai.evaluation._common.utils import parse_quality_evaluator_reason_score
 from azure.ai.evaluation._model_configurations import Conversation, Message
 from azure.ai.evaluation._common._experimental import experimental

+logger = logging.getLogger(__name__)
+

 @experimental
 class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """
-
-
+    """Evaluates the extent to which a given response contains all necessary and relevant information with respect to the
+    provided ground truth.
+
     The completeness measure assesses how thoroughly an AI model's generated response aligns with the key information,
     claims, and statements established in the ground truth. This evaluation considers the presence, accuracy,
     and relevance of the content provided.
+
     The assessment spans multiple levels, ranging from fully incomplete to fully complete, ensuring a comprehensive
     evaluation of the response's content quality.
+
     Use this metric when you need to evaluate an AI model's ability to deliver comprehensive and accurate information,
     particularly in text generation tasks where conveying all essential details is crucial for clarity,
     context, and correctness.
+
     Completeness scores range from 1 to 5:
+
     1: Fully incomplete — Contains none of the necessary information.
     2: Barely complete — Contains only a small portion of the required information.
     3: Moderately complete — Covers about half of the required content.
     4: Mostly complete — Includes most of the necessary details with minimal omissions.
     5: Fully complete — Contains all key information without any omissions.
+
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]

-    .. admonition:: Example:
-
-        .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START completeness_evaluator]
-            :end-before: [END completeness_evaluator]
-            :language: python
-            :dedent: 8
-            :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
-
     .. admonition:: Example using Azure AI Project URL:

         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
@@ -78,12 +77,14 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     ):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self.threshold = threshold
+        self.threshold = threshold  # to be removed in favor of _threshold
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            threshold=threshold,
             credential=credential,
+            _higher_is_better=True,
             **kwargs,
         )

@@ -160,20 +161,42 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
                 target=ErrorTarget.COMPLETENESS_EVALUATOR,
             )

-
+        result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = result.get("llm_output") if isinstance(result, dict) else result

         score = math.nan
-
-
+        llm_output_is_dict = isinstance(llm_output, dict)
+        if llm_output_is_dict or isinstance(llm_output, str):
+            reason = ""
+            if llm_output_is_dict:
+                score = float(llm_output.get("score", math.nan))
+                reason = llm_output.get("explanation", "")
+            else:
+                score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[1-5]")

-
+            binary_result = self._get_binary_result(score)

             # updating the result key and threshold to int based on the schema
             return {
                 f"{self._result_key}": int(score),
-                f"{self._result_key}_result":
-                f"{self._result_key}_threshold": int(self.
+                f"{self._result_key}_result": binary_result,
+                f"{self._result_key}_threshold": int(self._threshold),
                 f"{self._result_key}_reason": reason,
+                f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
+                f"{self._result_key}_model": result.get("model_id", ""),
+                f"{self._result_key}_sample_input": result.get("sample_input", ""),
+                f"{self._result_key}_sample_output": result.get("sample_output", ""),
             }

-
+        if logger:
+            logger.warning("LLM output is not a dictionary, returning NaN for the score.")
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }
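
One detail worth calling out in the completeness hunk above: the evaluator now tolerates backends that hand back the model output as raw text rather than parsed JSON. A dict is read directly, while a string falls back to parse_quality_evaluator_reason_score(..., valid_score_range="[1-5]"). Below is a self-contained sketch of that branching with a stand-in parser; the real helper lives in azure.ai.evaluation._common.utils and is not reproduced here.

import math
import re
from typing import Tuple, Union

def parse_reason_score(text: str) -> Tuple[float, str]:
    # Stand-in for parse_quality_evaluator_reason_score: take the last 1-5 digit as the score.
    matches = re.findall(r"[1-5]", text)
    return (float(matches[-1]) if matches else math.nan), text.strip()

def extract_score(llm_output: Union[dict, str, None]) -> Tuple[float, str]:
    # Mirrors the new dict-or-string handling in _response_completeness.py.
    if isinstance(llm_output, dict):
        return float(llm_output.get("score", math.nan)), llm_output.get("explanation", "")
    if isinstance(llm_output, str):
        return parse_reason_score(llm_output)
    return math.nan, ""

print(extract_score({"score": 4, "explanation": "Covers most key facts."}))
print(extract_score("Explanation: covers most key facts. Score: 4"))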

azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py

@@ -10,7 +10,11 @@ from typing_extensions import overload, override

 from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
-from ..._common.utils import
+from ..._common.utils import (
+    reformat_conversation_history,
+    reformat_agent_response,
+    reformat_tool_definitions,
+)
 from azure.ai.evaluation._model_configurations import Message
 from azure.ai.evaluation._common._experimental import experimental

@@ -40,6 +44,7 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         ~azure.ai.evaluation.OpenAIModelConfiguration]

     .. admonition:: Example:
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
             :start-after: [START task_adherence_evaluator]
             :end-before: [END task_adherence_evaluator]
@@ -72,12 +77,14 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE, credential=None, **kwargs):
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        self.threshold = threshold
+        self.threshold = threshold  # to be removed in favor of _threshold
         super().__init__(
             model_config=model_config,
             prompty_file=prompty_path,
             result_key=self._RESULT_KEY,
+            threshold=threshold,
             credential=credential,
+            _higher_is_better=True,
             **kwargs,
         )

@@ -153,19 +160,38 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
         if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
             eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)
-
+
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict["llm_output"]
+
+        score = math.nan
         if isinstance(llm_output, dict):
             score = float(llm_output.get("score", math.nan))
-            score_result = "pass" if score >= self.
+            score_result = "pass" if score >= self._threshold else "fail"
             reason = llm_output.get("explanation", "")
             return {
                 f"{self._result_key}": score,
+                f"gpt_{self._result_key}": score,
                 f"{self._result_key}_result": score_result,
-                f"{self._result_key}_threshold": self.
+                f"{self._result_key}_threshold": self._threshold,
                 f"{self._result_key}_reason": reason,
                 # Uncomment the following line in the next iteration after UI contracts are validated.
                 # f"{self._result_key}_additional_details": llm_output
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
         if logger:
             logger.warning("LLM output is not a dictionary, returning NaN for the score.")
-
+
+        binary_result = self._get_binary_result(score)
+        return {
+            self._result_key: float(score),
+            f"gpt_{self._result_key}": float(score),
+            f"{self._result_key}_result": binary_result,
+            f"{self._result_key}_threshold": self._threshold,
+        }

azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py

@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------

-from .
+from ._task_completion import _TaskCompletionEvaluator

-__all__ = ["
+__all__ = ["_TaskCompletionEvaluator"]

azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py}

@@ -18,8 +18,8 @@ logger = logging.getLogger(__name__)


 @experimental
-class
-    """The Task
+class _TaskCompletionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Task Completion evaluator determines whether an AI agent successfully completed the requested task based on:

     - Final outcome and deliverable of the task
     - Completeness of task requirements
@@ -27,8 +27,8 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
     This evaluator focuses solely on task completion and success, not on task adherence or intent understanding.

     Scoring is binary:
-    -
-    -
+    - 1 (pass): Task fully completed with usable deliverable that meets all user requirements
+    - 0 (fail): Task incomplete, partially completed, or deliverable does not meet requirements

     The evaluation includes task requirement analysis, outcome assessment, and completion gap identification.

@@ -39,29 +39,29 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):

     .. admonition:: Example:
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START
-            :end-before: [END
+            :start-after: [START task_completion_evaluator]
+            :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call a
+            :caption: Initialize and call a _TaskCompletionEvaluator with a query and response.

     .. admonition:: Example using Azure AI Project URL:

         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START
-            :end-before: [END
+            :start-after: [START task_completion_evaluator]
+            :end-before: [END task_completion_evaluator]
             :language: python
             :dedent: 8
-            :caption: Initialize and call
+            :caption: Initialize and call a _TaskCompletionEvaluator using Azure AI Project URL in the following format
             https://{resource_name}.services.ai.azure.com/api/projects/{project_name}

     """

-    _PROMPTY_FILE = "
-    _RESULT_KEY = "
+    _PROMPTY_FILE = "task_completion.prompty"
+    _RESULT_KEY = "task_completion"
     _OPTIONAL_PARAMS = ["tool_definitions"]

-    id = "azureai://built-in/evaluators/
+    id = "azureai://built-in/evaluators/task_completion"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""

     @override
@@ -83,20 +83,20 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
         query: Union[str, List[dict]],
         response: Union[str, List[dict]],
         tool_definitions: Optional[Union[dict, List[dict]]] = None,
-    ) -> Dict[str, Union[str,
-        """Evaluate task
+    ) -> Dict[str, Union[str, float]]:
+        """Evaluate task completion for a given query, response, and optionally tool definitions.
         The query and response can be either a string or a list of messages.


         Example with string inputs and no tools:
-            evaluator =
+            evaluator = _TaskCompletionEvaluator(model_config)
             query = "Plan a 3-day itinerary for Paris with cultural landmarks and local cuisine."
             response = "**Day 1:** Morning: Louvre Museum, Lunch: Le Comptoir du Relais..."

             result = evaluator(query=query, response=response)

         Example with list of messages:
-            evaluator =
+            evaluator = _TaskCompletionEvaluator(model_config)
             query = [{'role': 'system', 'content': 'You are a helpful travel planning assistant.'}, {'createdAt': 1700000060, 'role': 'user', 'content': [{'type': 'text', 'text': 'Plan a 3-day Paris itinerary with cultural landmarks and cuisine'}]}]
             response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant', 'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
             tool_definitions = [{'name': 'get_attractions', 'description': 'Get tourist attractions for a city.', 'parameters': {'type': 'object', 'properties': {'city': {'type': 'string', 'description': 'The city name.'}}}}]
@@ -109,8 +109,8 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
         :paramtype response: Union[str, List[dict]]
         :keyword tool_definitions: An optional list of messages containing the tool definitions the agent is aware of.
         :paramtype tool_definitions: Optional[Union[dict, List[dict]]]
-        :return: A dictionary with the task
-        :rtype: Dict[str, Union[str,
+        :return: A dictionary with the task completion evaluation results.
+        :rtype: Dict[str, Union[str, float]]
         """

     @override
@@ -127,8 +127,8 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
         return super().__call__(*args, **kwargs)

     @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[
-        """Do Task
+    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
+        """Do Task Completion evaluation.
         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are needed for the _flow method
         :type eval_input: Dict
         :return: The evaluation result.
@@ -138,31 +138,40 @@ class TaskSuccessEvaluator(PromptyEvaluatorBase[Union[str, bool]]):
         # which is a different schema than _base_prompty_eval.py
         if "query" not in eval_input and "response" not in eval_input:
             raise EvaluationException(
-                message=f"Both query and response must be provided as input to the Task
-                internal_message=f"Both query and response must be provided as input to the Task
+                message=f"Both query and response must be provided as input to the Task Completion evaluator.",
+                internal_message=f"Both query and response must be provided as input to the Task Completion evaluator.",
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.
+                target=ErrorTarget.TASK_COMPLETION_EVALUATOR,
             )
         eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
         eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
         if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
             eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)

-
+        prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
+        llm_output = prompty_output_dict.get("llm_output", {})
+
         if isinstance(llm_output, dict):
-            success = llm_output.get("success",
+            success = llm_output.get("success", 0)
             if isinstance(success, str):
-                success = success.upper() == "TRUE"
+                success = 1 if success.upper() == "TRUE" else 0

-            success_result = "pass" if success ==
+            success_result = "pass" if success == 1 else "fail"
             reason = llm_output.get("explanation", "")
             return {
                 f"{self._result_key}": success,
                 f"{self._result_key}_result": success_result,
                 f"{self._result_key}_reason": reason,
                 f"{self._result_key}_details": llm_output.get("details", ""),
+                f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
+                f"{self._result_key}_completion_tokens": prompty_output_dict.get("output_token_count", 0),
+                f"{self._result_key}_total_tokens": prompty_output_dict.get("total_token_count", 0),
+                f"{self._result_key}_finish_reason": prompty_output_dict.get("finish_reason", ""),
+                f"{self._result_key}_model": prompty_output_dict.get("model_id", ""),
+                f"{self._result_key}_sample_input": prompty_output_dict.get("sample_input", ""),
+                f"{self._result_key}_sample_output": prompty_output_dict.get("sample_output", ""),
             }
         if logger:
-            logger.warning("LLM output is not a dictionary, returning
-        return {self._result_key:
+            logger.warning("LLM output is not a dictionary, returning 0 for the success.")
+        return {self._result_key: 0}
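
A small but behavior-changing part of the rename above: the evaluator's success signal moves from a boolean to a 0/1 number (the return type is now Dict[str, Union[str, float]]), and a string "TRUE"/"FALSE" coming back from the model is normalized before the pass/fail label is derived. A standalone sketch of that normalization, separate from the SDK code:

from typing import Tuple, Union

def normalize_success(raw: Union[str, int]) -> Tuple[int, str]:
    # Mirrors the new handling in _task_completion.py: bool-ish strings collapse to 1/0,
    # and anything other than 1 is treated as a failure.
    success = raw
    if isinstance(success, str):
        success = 1 if success.upper() == "TRUE" else 0
    return success, ("pass" if success == 1 else "fail")

print(normalize_success("TRUE"))   # (1, 'pass')
print(normalize_success("false"))  # (0, 'fail')
print(normalize_success(0))        # (0, 'fail')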

azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty}

@@ -1,5 +1,5 @@
 ---
-name: Task
+name: Task Completion
 description: Evaluates whether a task was successfully completed
 model:
   api: chat
@@ -27,7 +27,7 @@ You are an expert evaluator who determines if an agent has successfully complete
 user:
 ROLE
 ====
-You are a judge on Task
+You are a judge on Task Completion who assesses the final outcome of a user-agent interaction. Your single focus is: **Was the user's task successfully and completely accomplished?**

 You are NOT evaluating:
 - How well the agent followed instructions