deepeval 3.6.7__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +725 -217
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/RECORD +75 -53
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
from typing import Optional, List, Union, Dict
|
|
2
|
+
|
|
3
|
+
from deepeval.utils import get_or_create_event_loop, prettify_list
|
|
4
|
+
from deepeval.metrics.utils import (
|
|
5
|
+
construct_verbose_logs,
|
|
6
|
+
trimAndLoadJson,
|
|
7
|
+
check_llm_test_case_params,
|
|
8
|
+
initialize_model,
|
|
9
|
+
)
|
|
10
|
+
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
|
|
11
|
+
from deepeval.metrics import BaseMetric
|
|
12
|
+
from deepeval.models import DeepEvalBaseLLM
|
|
13
|
+
from deepeval.metrics.indicator import metric_progress_indicator
|
|
14
|
+
from deepeval.metrics.step_efficiency.template import (
|
|
15
|
+
StepEfficiencyTemplate,
|
|
16
|
+
)
|
|
17
|
+
from deepeval.metrics.step_efficiency.schema import Task
|
|
18
|
+
from deepeval.metrics.plan_adherence.schema import (
|
|
19
|
+
AgentPlan,
|
|
20
|
+
PlanAdherenceScore,
|
|
21
|
+
)
|
|
22
|
+
from deepeval.metrics.plan_adherence.template import (
|
|
23
|
+
PlanAdherenceTemplate,
|
|
24
|
+
)
|
|
25
|
+
from deepeval.metrics.api import metric_data_manager
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class PlanAdherenceMetric(BaseMetric):
    """Scores how strictly an agent's execution followed its own declared plan.

    The metric is trace-based (``requires_trace = True``): the task and the
    agent's plan are first extracted from ``test_case._trace_dict`` via LLM
    calls, then the evaluation model grades adherence on a 0-1 scale.  When no
    plan is evidenced in the trace, the metric scores 1 with an explanatory
    reason instead of failing.
    """

    _required_params: List[LLMTestCaseParams] = [
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.TOOLS_CALLED,
    ]

    # Reason reported when the trace contains no extractable plan.
    _NO_PLAN_REASON = (
        "There were no plans to evaluate within the trace of your agent's "
        "execution. Please check if the agent's planning or reasoning or "
        "thinking is stored in any one of the trace attributes."
    )

    def __init__(
        self,
        threshold: float = 0.5,
        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
        include_reason: bool = True,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
    ):
        # Strict mode turns the metric into binary pass/fail at threshold 1.
        self.threshold = 1 if strict_mode else threshold
        self.model, self.using_native_model = initialize_model(model)
        self.evaluation_model = self.model.get_model_name()
        self.include_reason = include_reason
        self.async_mode = async_mode
        self.strict_mode = strict_mode
        self.verbose_mode = verbose_mode
        self.requires_trace = True

    def measure(
        self,
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        """Synchronously measure plan adherence for ``test_case``.

        When ``async_mode`` is enabled, delegates to :meth:`a_measure` on an
        event loop (which performs its own logging); otherwise runs the
        extraction and scoring pipeline inline.  Returns the final score.
        """
        has_trace: bool = isinstance(test_case._trace_dict, Dict)
        if not has_trace:
            check_llm_test_case_params(test_case, self._required_params, self)

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self, _show_indicator=_show_indicator, _in_component=_in_component
        ):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_measure(
                        test_case,
                        _show_indicator=False,
                        _in_component=_in_component,
                        _log_metric_to_confident=_log_metric_to_confident,
                    )
                )
            else:
                task = self._extract_task_from_trace(test_case)
                agent_plan = self._extract_plan_from_trace(test_case)
                adherence = (
                    None
                    if len(agent_plan.plan) == 0
                    else self._get_plan_adherence_score(
                        task, agent_plan.plan, test_case
                    )
                )
                self._finalize_verdict(task, agent_plan, adherence)
                # Logging happens here only on the sync path; the async path
                # logs inside a_measure, so doing it again would double-post.
                if _log_metric_to_confident:
                    metric_data_manager.post_metric_if_enabled(
                        self, test_case=test_case
                    )

            return self.score

    async def a_measure(
        self,
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        """Async counterpart of :meth:`measure`; returns the final score."""
        has_trace: bool = isinstance(test_case._trace_dict, Dict)
        if not has_trace:
            check_llm_test_case_params(test_case, self._required_params, self)

        self.evaluation_cost = 0 if self.using_native_model else None

        with metric_progress_indicator(
            self,
            async_mode=True,
            _show_indicator=_show_indicator,
            _in_component=_in_component,
        ):
            task = await self._a_extract_task_from_trace(test_case)
            agent_plan = await self._a_extract_plan_from_trace(test_case)
            adherence = (
                None
                if len(agent_plan.plan) == 0
                else await self._a_get_plan_adherence_score(
                    task, agent_plan.plan, test_case
                )
            )
            self._finalize_verdict(task, agent_plan, adherence)

            if _log_metric_to_confident:
                metric_data_manager.post_metric_if_enabled(
                    self, test_case=test_case
                )

            return self.score

    def _finalize_verdict(
        self, task: str, agent_plan: "AgentPlan", adherence: Optional["PlanAdherenceScore"]
    ) -> None:
        """Populate score/reason/success/verbose_logs from an optional adherence result.

        ``adherence is None`` means no plan was found in the trace, which is
        treated as a vacuous pass.  ``success`` is always assigned here so
        :meth:`is_successful` never sees a missing attribute.
        """
        if adherence is None:
            self.score = 1
            self.reason = self._NO_PLAN_REASON
        else:
            # In strict mode any sub-threshold score collapses to 0.
            self.score = (
                0
                if self.strict_mode and adherence.score < self.threshold
                else adherence.score
            )
            self.reason = adherence.reason
        self.success = self.score >= self.threshold
        self.verbose_logs = construct_verbose_logs(
            self,
            steps=[
                f"Task: {task} \n",
                f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
                f"Final Score: {self.score} \n",
                f"Final Reason: {self.reason} \n",
            ],
        )

    def _generate_structured(self, prompt: str, schema):
        """Sync structured generation with cost tracking and a raw-JSON fallback.

        Native models return ``(result, cost)``; custom models that reject the
        ``schema`` kwarg (TypeError) are re-invoked without it and their raw
        completion is parsed into ``schema``.
        """
        if self.using_native_model:
            res, cost = self.model.generate(prompt, schema=schema)
            self.evaluation_cost += cost
            return res
        try:
            return self.model.generate(prompt, schema=schema)
        except TypeError:
            raw = self.model.generate(prompt)
            data = trimAndLoadJson(raw, self)
            return schema(**data)

    async def _a_generate_structured(self, prompt: str, schema):
        """Async counterpart of :meth:`_generate_structured`."""
        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt, schema=schema)
            self.evaluation_cost += cost
            return res
        try:
            return await self.model.a_generate(prompt, schema=schema)
        except TypeError:
            raw = await self.model.a_generate(prompt)
            data = trimAndLoadJson(raw, self)
            return schema(**data)

    def _get_plan_adherence_score(
        self, task: str, plan: List[str], test_case: LLMTestCase
    ) -> "PlanAdherenceScore":
        """Ask the evaluation model to grade how well the trace followed the plan."""
        prompt = PlanAdherenceTemplate.evaluate_adherence(
            task, "\n".join(plan), test_case._trace_dict
        )
        return self._generate_structured(prompt, PlanAdherenceScore)

    async def _a_get_plan_adherence_score(
        self, task: str, plan: List[str], test_case: LLMTestCase
    ) -> "PlanAdherenceScore":
        """Async counterpart of :meth:`_get_plan_adherence_score`."""
        prompt = PlanAdherenceTemplate.evaluate_adherence(
            task, "\n".join(plan), test_case._trace_dict
        )
        return await self._a_generate_structured(prompt, PlanAdherenceScore)

    def _extract_plan_from_trace(self, test_case: LLMTestCase) -> "AgentPlan":
        """Extract the agent's evidenced plan (possibly empty) from the trace."""
        prompt = PlanAdherenceTemplate.extract_plan_from_trace(
            test_case._trace_dict
        )
        return self._generate_structured(prompt, AgentPlan)

    async def _a_extract_plan_from_trace(
        self, test_case: LLMTestCase
    ) -> "AgentPlan":
        """Async counterpart of :meth:`_extract_plan_from_trace`."""
        prompt = PlanAdherenceTemplate.extract_plan_from_trace(
            test_case._trace_dict
        )
        return await self._a_generate_structured(prompt, AgentPlan)

    def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
        """Extract the user task string from the trace."""
        prompt = StepEfficiencyTemplate.extract_task_from_trace(
            test_case._trace_dict
        )
        task: Task = self._generate_structured(prompt, Task)
        return task.task

    async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
        """Async counterpart of :meth:`_extract_task_from_trace`."""
        prompt = StepEfficiencyTemplate.extract_task_from_trace(
            test_case._trace_dict
        )
        task: Task = await self._a_generate_structured(prompt, Task)
        return task.task

    def is_successful(self) -> bool:
        """Return whether the last measurement met the threshold.

        An errored run is never successful; a run whose score was never set
        (or is None) is treated as unsuccessful rather than raising.
        """
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except (AttributeError, TypeError):
                self.success = False
        return self.success

    @property
    def __name__(self):
        return "Plan Adherence"
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import textwrap
|
|
2
|
+
import json
|
|
3
|
+
from deepeval.tracing.utils import make_json_serializable
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PlanAdherenceTemplate:
    """Prompt builders for the plan-adherence metric.

    Both templates serialize the execution trace to JSON and ask the
    evaluation model to answer with a strict JSON object.
    """

    @staticmethod
    def extract_plan_from_trace(trace: dict) -> str:
        """Build the prompt that extracts the agent's evidenced plan from *trace*."""
        prompt = f"""You are a **systems analyst** evaluating an AI agent's execution trace.

Your sole task is to extract the **explicit or clearly implied plan** the agent followed or intended to follow — *only if that plan is directly evidenced in the trace*.

STRICT RULES TO FOLLOW:

1. Source Evidence Requirement
   - Every plan step you include **must** be directly supported by explicit text in the trace.
   - Acceptable evidence sources:
     - `"reasoning"` or `"thought"` fields inside tool calls or function invocations.
     - Explicit plan-like statements or lists written by the agent (e.g., “My plan is to…”).
   - If no evidence exists for a step, DO NOT infer or invent it.

2. No Hallucination Policy
   - You must *not* create or rephrase steps that aren't explicitly or strongly implied by the trace.
   - If there is no coherent plan present, output an empty list.

3. Focus on Intent, Not Outcomes
   - If the agent's plan is stated but execution differs, still extract the intended steps — but only if those intended steps are traceable.

4. Granularity
   - Each step should represent a single distinct action or intention.
   - Avoid merging multiple intentions into one step or splitting one intention into multiple steps.

5. Neutral Language
   - Reproduce the plan steps in **neutral, minimal paraphrasing**.
   - Do not interpret motivation, quality, or success of actions.

OUTPUT FORMAT:

Return a JSON object with exactly this structure:
{{
    "plan": [
        "step 1",
        "step 2",
        ...
    ]
}}

If no plan is evidenced in the trace, return:
{{
    "plan": []
}}

Do not include commentary, confidence scores, or explanations.

TRACE:

{json.dumps(trace, indent=2, default=str)}

JSON:
"""
        return textwrap.dedent(prompt)

    @staticmethod
    def evaluate_adherence(
        user_task: str, agent_plan: str, execution_trace: dict
    ) -> str:
        """Build the adversarial prompt grading plan obedience on a 0-1 scale."""
        prompt = f"""You are an **adversarial plan adherence evaluator**. Your goal is to assign the **lowest justifiable score** based on how strictly the agent's actions in the execution trace align with its declared plan.

INPUTS:

- **User Task:** The original request or objective.
- **Agent Plan:** The explicit step-by-step plan the agent was supposed to follow.
- **Execution Trace:** A detailed record of all agent actions, reasoning, tool calls, and outputs.

EVALUATION OBJECTIVE:

Determine whether the agent **exactly and exclusively** followed its plan.
You are not evaluating success, correctness, or usefulness — **only plan obedience**.

Assume **non-adherence by default** unless clear, direct evidence in the trace proves that
each planned step was executed *as written* and *no additional actions occurred*.

### STRICT ADHERENCE RULES

1. Step Verification
   - Every step in the plan must correspond to a **verifiable, explicit** action or reasoning entry in the trace.
   - Each step must appear in the same logical order as the plan.
   - If a step is missing, only implied, or ambiguous, treat as **not followed**.

2. No Extraneous Actions
   - If the trace includes **any** major action, tool call, or reasoning segment not clearly present in the plan, immediately lower the score to as low as possible.
   - Extra or unnecessary steps are considered serious violations.

3. Order Consistency
   - If the agent performed steps in a different order than the plan specifies, the score must be close to 0, regardless of other alignment.

4. Completeness
   - If even one planned step is missing, skipped, or only partially reflected in the trace, the score must be lowest possible.

5. Ambiguity Handling
   - If it is unclear whether a trace action corresponds to a plan step, treat that step as **not executed**.
   - When uncertain, assign the **lower score**.

6. Focus Exclusively on Plan Compliance
   - Ignore task success, reasoning quality, or correctness of outcomes.
   - Evaluate *only* whether the trace reflects the exact plan execution.


SCORING SCALE

- **1.0 — Perfect adherence**
  - Every planned step is explicitly and verifiably present in the trace, in correct order.
  - No skipped or added steps.
  - No ambiguity in matching.

- **0.75 — Strong adherence**
  - All or nearly all steps are executed in order.
  - At most one minor deviation (e.g., a trivial reordering or minor redundant step) that does not change the plan’s structure.

- **0.5 — Partial adherence**
  - Some steps clearly match, but others are missing, out of order, or replaced.
  - At least one extra or ambiguous action appears.
  - *This should be the highest score possible when there are any deviations.*

- **0.25 — Weak adherence**
  - Only a few steps from the plan appear in the trace, or multiple extraneous actions occur.
  - The structure or sequence of the plan is mostly lost.

- **0.0 — No adherence**
  - The trace shows little or no resemblance to the plan.
  - Steps are ignored, replaced, or executed in an entirely different order.

Always err toward the **lower score** when evidence is partial, ambiguous, or contradictory.

OUTPUT FORMAT:

Return a JSON object with exactly this structure:

{{
    "score": 0.0,
    "reason": "1-3 concise, factual sentences citing specific matched, missing, or extra steps."
}}

Requirements for `"reason"`:
- Reference specific plan step numbers or phrases.
- Mention concrete trace evidence of mismatches or additions.
- Avoid subjective adjectives (e.g., “mostly”, “close”, “reasonable”).
- Be precise and neutral.

---

INPUTS:

User Task:
{user_task}

Agent Plan:
{agent_plan}

Execution Trace:
{json.dumps(execution_trace, indent=2, default=make_json_serializable)}

---

JSON:
"""
        return textwrap.dedent(prompt)
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .plan_quality import PlanQualityMetric
|