deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
--- /dev/null
+++ b/deepeval/metrics/plan_quality/plan_quality.py
@@ -0,0 +1,292 @@
+from typing import Optional, List, Union, Dict
+
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_llm_test_case_params,
+    initialize_model,
+)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.metrics import BaseMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.step_efficiency.template import (
+    StepEfficiencyTemplate,
+)
+from deepeval.metrics.step_efficiency.schema import Task
+from deepeval.metrics.plan_quality.schema import (
+    AgentPlan,
+    PlanQualityScore,
+)
+from deepeval.metrics.plan_quality.template import (
+    PlanQualityTemplate,
+)
+from deepeval.metrics.plan_adherence.template import (
+    PlanAdherenceTemplate,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class PlanQualityMetric(BaseMetric):
+
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.TOOLS_CALLED,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.requires_trace = True
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        if not has_trace:
+            check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                task = self._extract_task_from_trace(test_case)
+                agent_plan = self._extract_plan_from_trace(test_case)
+                if len(agent_plan.plan) == 0:
+                    self.score = 1
+                    self.reason = "There were no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in any one of the trace attributes."
+                else:
+                    plan_quality_score = self._get_plan_quality_score(
+                        task, agent_plan.plan
+                    )
+                    self.score = (
+                        0
+                        if self.strict_mode
+                        and plan_quality_score.score < self.threshold
+                        else plan_quality_score.score
+                    )
+                    self.reason = plan_quality_score.reason
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Task: {task} \n",
+                        f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                        f"Final Score Score: {self.score} \n",
+                        f"Final Score Reason: {self.reason} \n",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        if not has_trace:
+            check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            task = await self._a_extract_task_from_trace(test_case)
+            agent_plan = await self._a_extract_plan_from_trace(test_case)
+            if len(agent_plan.plan) == 0:
+                self.score = 1
+                self.reason = "There are no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in the trace attributes."
+            else:
+                plan_quality_score = await self._a_get_plan_quality_score(
+                    task, agent_plan.plan
+                )
+                self.score = (
+                    0
+                    if self.strict_mode
+                    and plan_quality_score.score < self.threshold
+                    else plan_quality_score.score
+                )
+                self.reason = plan_quality_score.reason
+            self.success = self.score >= self.threshold
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Task: {task} \n",
+                    f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                    f"Final Score: {self.score} \n",
+                    f"Final Reason: {self.reason} \n",
+                ],
+            )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    def _get_plan_quality_score(self, task, plan):
+        prompt = PlanQualityTemplate.evaluate_plan_quality(
+            task, "\n".join(plan)
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=PlanQualityScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=PlanQualityScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanQualityScore(**data)
+
+    async def _a_get_plan_quality_score(self, task, plan):
+        prompt = PlanQualityTemplate.evaluate_plan_quality(
+            task, "\n".join(plan)
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=PlanQualityScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = await self.model.a_generate(
+                    prompt, schema=PlanQualityScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanQualityScore(**data)
+
+    def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
+        prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=AgentPlan)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=AgentPlan)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return AgentPlan(**data)
+
+    async def _a_extract_plan_from_trace(
+        self, test_case: LLMTestCase
+    ) -> AgentPlan:
+        prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=AgentPlan)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = await self.model.a_generate(
+                    prompt, schema=AgentPlan
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return AgentPlan(**data)
+
+    def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+        prompt = StepEfficiencyTemplate.extract_task_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Task)
+            self.evaluation_cost += cost
+            return res.task
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=Task)
+                return res.task
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["task"]
+
+    async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+        prompt = StepEfficiencyTemplate.extract_task_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Task)
+            self.evaluation_cost += cost
+            return res.task
+        else:
+            try:
+                res: Task = await self.model.a_generate(prompt, schema=Task)
+                return res.task
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["task"]
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Plan Quality"
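For orientation, here is a minimal usage sketch of the new `PlanQualityMetric`, based only on the constructor and `measure()` signature shown in the hunk above. The test case values, tool names, and the `ToolCall` import are illustrative assumptions; note the metric sets `requires_trace = True` and reads the task and plan from `test_case._trace_dict`, so in practice it is run against traced agent executions rather than a bare test case.

```python
# Illustrative sketch only: field values are invented, and a real run needs an
# evaluation model configured (e.g. credentials for the default judge model).
from deepeval.test_case import LLMTestCase, ToolCall
from deepeval.metrics.plan_quality.plan_quality import PlanQualityMetric

metric = PlanQualityMetric(
    threshold=0.5,      # score >= threshold counts as success
    strict_mode=False,  # strict_mode forces the threshold to 1 and zeroes failing scores
    verbose_mode=True,  # surfaces the extracted task/plan via construct_verbose_logs
)

test_case = LLMTestCase(
    input="Book a flight to Tokyo and add it to my calendar",
    actual_output="Flight booked and calendar invite created.",
    tools_called=[ToolCall(name="search_flights"), ToolCall(name="create_event")],
    # The plan itself is read from the trace attached to the test case
    # (test_case._trace_dict), typically populated by running the agent
    # under deepeval tracing.
)

metric.measure(test_case)
print(metric.score, metric.reason)
```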
--- /dev/null
+++ b/deepeval/metrics/plan_quality/template.py
@@ -0,0 +1,101 @@
+import textwrap
+import json
+from deepeval.tracing.utils import make_json_serializable
+
+
+class PlanQualityTemplate:
+
+    @staticmethod
+    def evaluate_plan_quality(user_task: str, agent_plan: list) -> str:
+        return textwrap.dedent(
+            f"""You are a **plan quality evaluator**. Your task is to critically assess the **quality, completeness, and optimality** of an AI agent's plan to accomplish the given user task.
+
+            INPUTS:
+
+            - **User Task:** The user's explicit goal or instruction.
+            - **Agent Plan:** The ordered list of steps the agent intends to follow to achieve that goal.
+
+            EVALUATION OBJECTIVE:
+
+            Judge the **intrinsic quality** of the plan — whether the plan itself is strong enough to fully and efficiently achieve the user's task.
+
+            The evaluation must be **strict**. If the plan is incomplete, inefficient, redundant, or missing critical details, assign a very low score.
+
+            STRICT EVALUATION CRITERIA:
+
+            1. Completeness (Most Important)
+            - The plan must fully address all major requirements of the user task.
+            - Missing even one critical subtask or dependency should reduce the score sharply.
+            - The plan must include all prerequisite actions necessary for the final outcome.
+
+            2. Logical Coherence
+            - Steps must follow a clear, rational sequence that leads directly to completing the task.
+            - Disordered, redundant, or circular reasoning should be penalized heavily.
+            - Every step must have a clear purpose; no filler or irrelevant actions.
+
+            3. Optimality and Efficiency
+            - The plan must be **minimal but sufficient** — no unnecessary or repetitive steps.
+            - If a more direct, simpler, or logically superior plan could achieve the same outcome, the current plan should receive a lower score.
+
+            4. Level of Detail
+            - Each step should be specific enough for an agent to execute it reliably without ambiguity.
+            - Vague steps (e.g., “Do research”, “Handle results”) that lack operational clarity
+            lower the score.
+
+            5. Alignment with Task
+            - The plan must explicitly and directly target the user's stated goal.
+            - If any step diverges from the main objective, the score should drop significantly.
+
+            ---
+
+            SCORING SCALE (STRICT)
+
+            - **1.0 — Excellent plan**
+            - Fully complete, logically ordered, and optimally efficient.
+            - No missing, redundant, or ambiguous steps.
+            - Directly fulfills every aspect of the user task.
+
+            - **0.75 — Good plan**
+            - Covers nearly all aspects of the task with clear logic.
+            - Minor gaps or small inefficiencies that do not block task completion.
+
+            - **0.5 — Adequate but flawed plan**
+            - Partially complete; key details missing or step order inefficient.
+            - Some ambiguity or redundancy that would likely affect execution success.
+
+            - **0.25 — Weak plan**
+            - Major missing steps or unclear logic.
+            - The plan would likely fail to complete the task as written.
+
+            - **0.0 — Inadequate plan**
+            - Irrelevant, incoherent, or severely incomplete plan.
+            - Does not align with the user’s task or cannot plausibly achieve it.
+
+            *When in doubt, assign the lower score.*
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with this exact structure:
+
+            {{
+            "score": 0.0,
+            "reason": "1-3 short, precise sentences explaining what the plan lacks or how it could fail."
+            }}
+
+            The `"reason"` must:
+            - Reference specific missing, unclear, or inefficient steps.
+            - Avoid vague language (“seems fine”, “mostly works”).
+            - Use objective terms describing gaps or weaknesses.
+
+            PROVIDED DATA
+
+            User Task:
+            {user_task}
+
+            Agent Plan:
+            {agent_plan}
+
+
+            JSON:
+            """
+        )
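A quick way to see what the judge model receives is to render the template directly. This is a small illustrative snippet using the static method defined above; the task and plan strings are invented, and the plan is pre-joined with newlines the same way `PlanQualityMetric` does before calling it.

```python
from deepeval.metrics.plan_quality.template import PlanQualityTemplate

# Hypothetical task and plan, just to inspect the rendered judge prompt.
task = "Summarize the latest quarterly report and email it to the finance team"
plan = "\n".join(
    [
        "1. Locate the latest quarterly report",
        "2. Summarize the key figures",
        "3. Draft and send the email to the finance team",
    ]
)

prompt = PlanQualityTemplate.evaluate_plan_quality(task, plan)
print(prompt)  # ends with "JSON:", expecting a {"score": ..., "reason": ...} object back
```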
--- a/deepeval/metrics/prompt_alignment/prompt_alignment.py
+++ b/deepeval/metrics/prompt_alignment/prompt_alignment.py
@@ -20,6 +20,8 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
 from deepeval.config.settings import get_settings
 
+from deepeval.metrics.api import metric_data_manager
+
 
 class PromptAlignmentMetric(BaseMetric):
 
@@ -55,6 +57,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -93,6 +96,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -101,6 +108,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -128,7 +136,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str, actual_output: str) -> str:
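The same two additions, a `_log_metric_to_confident` keyword argument and a `metric_data_manager.post_metric_if_enabled(...)` call after the verbose logs are built, repeat below for `RoleAdherenceMetric` and `RoleViolationMetric`. A hedged sketch of what the flag means for callers follows; the constructor arguments and test case values are illustrative, and the underscore prefix marks the flag as internal, so it may change without notice.

```python
from deepeval.test_case import LLMTestCase
from deepeval.metrics import PromptAlignmentMetric

# Example instructions and test case are made up for illustration.
metric = PromptAlignmentMetric(prompt_instructions=["Reply in formal English"])
test_case = LLMTestCase(
    input="how do i reset my password?",
    actual_output="To reset your password, please visit the account settings page.",
)

# Default behaviour: the metric result may be posted to Confident AI when enabled.
metric.measure(test_case)

# Opting out for a single call via the new internal flag.
metric.measure(test_case, _log_metric_to_confident=False)
```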
--- a/deepeval/metrics/role_adherence/role_adherence.py
+++ b/deepeval/metrics/role_adherence/role_adherence.py
@@ -1,6 +1,7 @@
 from typing import Optional, Union, List
 
 from deepeval.metrics import BaseConversationalMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
 )
@@ -44,6 +45,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case,
@@ -63,6 +65,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -82,6 +85,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -89,6 +96,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -124,6 +132,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, role: str) -> str:
--- a/deepeval/metrics/role_violation/role_violation.py
+++ b/deepeval/metrics/role_violation/role_violation.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.role_violation.template import RoleViolationTemplate
 from deepeval.metrics.role_violation.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class RoleViolationMetric(BaseMetric):
@@ -58,6 +59,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class RoleViolationMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -94,6 +97,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -102,6 +109,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -131,6 +139,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
--- /dev/null
+++ b/deepeval/metrics/step_efficiency/__init__.py
@@ -0,0 +1 @@
+from .step_efficiency import StepEfficiencyMetric
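Finally, the one-line `__init__.py` above makes the new metric importable from its package path. The other new metric packages in this release (goal_accuracy, plan_adherence, plan_quality, tool_use, topic_adherence) each gain a similar one-line `__init__.py` per the file list; assuming they follow the same re-export pattern, imports would look like the sketch below. Only the `StepEfficiencyMetric` line is confirmed by the diff content shown here.

```python
# Only the first import is confirmed by the __init__.py shown above; the second
# assumes the plan_quality package uses the same one-line re-export pattern.
from deepeval.metrics.step_efficiency import StepEfficiencyMetric
from deepeval.metrics.plan_quality import PlanQualityMetric  # assumed re-export
```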