deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
--- /dev/null
+++ deepeval/metrics/goal_accuracy/goal_accuracy.py
@@ -0,0 +1,349 @@
+from typing import Optional, List, Union
+import asyncio
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    get_unit_interactions,
+    print_tools_called,
+    check_conversational_test_case_params,
+    initialize_model,
+)
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.goal_accuracy.template import (
+    GoalAccuracyTemplate,
+)
+from deepeval.metrics.goal_accuracy.schema import (
+    GoalSteps,
+    GoalScore,
+    PlanScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class GoalAccuracyMetric(BaseConversationalMetric):
+
+    _required_test_case_params = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case, self._required_test_case_params, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                goal_and_steps_taken = self._goal_and_steps_taken(
+                    unit_interactions
+                )
+                goal_scores = [
+                    self._get_goal_accuracy_score(
+                        task.user_goal, task.steps_taken
+                    )
+                    for task in goal_and_steps_taken
+                ]
+                plan_scores = [
+                    self._get_plan_scores(task.user_goal, task.steps_taken)
+                    for task in goal_and_steps_taken
+                ]
+                self.score = self._calculate_score(goal_scores, plan_scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(goal_scores, plan_scores)
+
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Goals and steps taken: \n{self.print_goals_and_steps_taken(goal_and_steps_taken)} \n",
+                        f"Goal evaluations: {prettify_list(goal_scores)} \n\n"
+                        f"Plan evaluations: {prettify_list(plan_scores)} \n\n"
+                        f"Final Score: {self.score}",
+                        f"Final Reason: {self.reason}",
+                    ],
+                )
+
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case, self._required_test_case_params, self
+        )
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            goal_and_steps_taken = self._goal_and_steps_taken(unit_interactions)
+            goal_scores = await asyncio.gather(
+                *[
+                    self._a_get_goal_accuracy_score(
+                        task.user_goal, task.steps_taken
+                    )
+                    for task in goal_and_steps_taken
+                ]
+            )
+            plan_scores = await asyncio.gather(
+                *[
+                    self._a_get_plan_scores(task.user_goal, task.steps_taken)
+                    for task in goal_and_steps_taken
+                ]
+            )
+            self.score = self._calculate_score(goal_scores, plan_scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(
+                goal_scores, plan_scores
+            )
+
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Goals and steps taken: \n{self.print_goals_and_steps_taken(goal_and_steps_taken)} \n",
+                    f"Goal evaluations: {prettify_list(goal_scores)} \n\n"
+                    f"Plan evaluations: {prettify_list(plan_scores)} \n\n"
+                    f"Final Score: {self.score}",
+                    f"Final Reason: {self.reason}",
+                ],
+            )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    def _goal_and_steps_taken(
+        self, unit_interactions: List[List[Turn]]
+    ) -> List[GoalSteps]:
+        goal_and_steps_taken = []
+        for unit_interaction in unit_interactions:
+            user_messages = "User messages: \n"
+            for turn in unit_interaction:
+                if turn.role == "user":
+                    user_messages += turn.content + "\n"
+                else:
+                    break
+            new_goal_steps = GoalSteps(user_goal=user_messages, steps_taken=[])
+            assistant_messages = "Assistant messages: \n"
+            for turn in unit_interaction[1:]:
+                if turn.role == "assistant":
+                    assistant_messages += f"{turn.content} \n"
+                    if turn.tools_called:
+                        assistant_messages += f"Tools called: \n{print_tools_called(turn.tools_called)} \n"
+            new_goal_steps.steps_taken.append(assistant_messages)
+            goal_and_steps_taken.append(new_goal_steps)
+        return goal_and_steps_taken
+
+    def _get_plan_scores(self, user_goal, steps_taken):
+        prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
+            user_goal, "\n".join(steps_taken)
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=PlanScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: PlanScore = self.model.generate(prompt, schema=PlanScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanScore(**data)
+
+    async def _a_get_plan_scores(self, user_goal, steps_taken):
+        prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
+            user_goal, "\n".join(steps_taken)
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=PlanScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: PlanScore = await self.model.a_generate(
+                    prompt, schema=PlanScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanScore(**data)
+
+    def _calculate_score(
+        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+    ):
+        goal_scores = [goal_score.score for goal_score in goal_scores]
+        plan_scores = [plan_score.score for plan_score in plan_scores]
+        goal_score_divisor = len(goal_scores) if len(goal_scores) > 0 else 1
+        plan_score_divisor = len(plan_scores) if len(plan_scores) > 0 else 1
+        goal_avg = sum(goal_scores) / goal_score_divisor
+        plan_avg = sum(plan_scores) / plan_score_divisor
+        score = (goal_avg + plan_avg) / 2
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    def _generate_reason(
+        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+    ):
+        goal_evaluations = ""
+        for goal_score in goal_scores:
+            goal_evaluations += (
+                f"Score: {goal_score.score}, Reason: {goal_score.reason}"
+            )
+        plan_evalautions = ""
+        for plan_score in plan_scores:
+            plan_evalautions += (
+                f"Score: {plan_score.score}, Reason: {plan_score.reason} \n"
+            )
+
+        prompt = GoalAccuracyTemplate.get_final_reason(
+            self.score, self.threshold, goal_evaluations, plan_evalautions
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+    ):
+        goal_evaluations = ""
+        for goal_score in goal_scores:
+            goal_evaluations += (
+                f"Score: {goal_score.score}, Reason: {goal_score.reason}"
+            )
+        plan_evalautions = ""
+        for plan_score in plan_scores:
+            plan_evalautions += (
+                f"Score: {plan_score.score}, Reason: {plan_score.reason} \n"
+            )
+
+        prompt = GoalAccuracyTemplate.get_final_reason(
+            self.score, self.threshold, goal_evaluations, plan_evalautions
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _get_goal_accuracy_score(self, user_goal, steps_taken):
+        prompt = GoalAccuracyTemplate.get_accuracy_score(
+            user_goal, "\n".join(steps_taken)
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=GoalScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: GoalScore = self.model.generate(prompt, schema=GoalScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return GoalScore(**data)
+
+    async def _a_get_goal_accuracy_score(self, user_goal, steps_taken):
+        prompt = GoalAccuracyTemplate.get_accuracy_score(
+            user_goal, "\n".join(steps_taken)
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=GoalScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: GoalScore = await self.model.a_generate(
+                    prompt, schema=GoalScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return GoalScore(**data)
+
+    def print_goals_and_steps_taken(self, goals_and_steps):
+        final_goals_and_steps = ""
+        for goal_step in goals_and_steps:
+            final_goals_and_steps += f"{goal_step.user_goal} \n"
+            final_goals_and_steps += (
+                f"c{prettify_list(goal_step.steps_taken)} \n\n"
+            )
+        return final_goals_and_steps
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Goal Accuracy"
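For orientation: the new `GoalAccuracyMetric` splits a conversation into unit interactions, asks the judge model for a goal-accuracy score and a plan score per interaction, then averages the two means into the final score. Below is a minimal usage sketch, not taken from the package; it assumes `GoalAccuracyMetric` is among the exports added to `deepeval/metrics/__init__.py` in this release, that a judge model is configured (for example via an OpenAI key), and the conversation content is invented.

```python
# Hedged sketch: exercising the new GoalAccuracyMetric from this diff.
# Turn/ConversationalTestCase fields mirror what the metric reads above
# (role, content, tools_called); the example conversation is made up.
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics import GoalAccuracyMetric  # assumed re-export in 3.6.8

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="List three renewable energy sources."),
        Turn(role="assistant", content="Solar, wind, and hydro power."),
    ]
)

metric = GoalAccuracyMetric(threshold=0.5, async_mode=False, verbose_mode=True)
score = metric.measure(test_case)  # mean goal score and mean plan score, averaged
print(score, metric.reason)
```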
--- /dev/null
+++ deepeval/metrics/goal_accuracy/schema.py
@@ -0,0 +1,17 @@
+from pydantic import BaseModel
+from typing import List
+
+
+class GoalSteps(BaseModel):
+    user_goal: str
+    steps_taken: List[str]
+
+
+class GoalScore(BaseModel):
+    score: float
+    reason: str
+
+
+class PlanScore(BaseModel):
+    score: float
+    reason: str
--- /dev/null
+++ deepeval/metrics/goal_accuracy/template.py
@@ -0,0 +1,235 @@
+from typing import List
+import textwrap
+
+
+class GoalAccuracyTemplate:
+    @staticmethod
+    def get_accuracy_score(task, steps_taken):
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.
+
+PURPOSE:
+
+Evaluate whether the assistant's **visible output** (what the user actually saw) **fully and correctly achieved the user's stated goal.
+Ignore internal reasoning, hidden tool calls, or retriever outputs unless their results were explicitly surfaced to the user.
+
+The evaluation must be **strict and adversarial** — if the goal is not *clearly, fully, and correctly achieved*, assign a low score.
+
+EVALUATION RULES
+
+1. **User-visible fulfillment only**
+- Base your judgment solely on what the user would see in the assistant's message.
+- Ignore hidden or internal steps unless their results were explicitly communicated.
+
+2. **Goal completion**
+- The assistant must explicitly provide everything the user asked for.
+- If even one subpart of the task is missing, incomplete, or vague, the score must be **≤ 0.5**.
+
+3. **Correctness and relevance**
+- The information provided must be factually correct and directly relevant to the task.
+- Hallucinated or unrelated content automatically lowers the score.
+
+4. **Self-sufficiency**
+- The visible response must stand on its own; the user should not need prior context or follow-up clarification.
+
+5. **Strict bias toward failure**
+- When uncertain, assume the goal was **not achieved**.
+- The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.
+
+SCORING GUIDE:
+
+- **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
+- **0.75** → Mostly achieved; minor omissions or trivial inaccuracies.
+- **0.5** → Partially achieved; core goal addressed, but key parts missing or incorrect.
+- **0.25** → Weak attempt; loosely related but fails to satisfy the user’s request.
+- **0.0** → Goal not achieved at all; irrelevant, wrong, or missing answer.
+
+*When in doubt, choose the lower score.*
+
+OUTPUT FORMAT:
+
+Return only a valid JSON object with this structure:
+
+{{
+"score": 0.0,
+"reason": "1-3 factual sentences explaining what parts of the user's goal were or were not achieved."
+}}
+
+The reason must:
+- Be objective and concise.
+- Refer to **specific missing or incorrect elements**.
+- Avoid vague language (“somewhat correct”, “pretty accurate”).
+
+EXAMPLES:
+
+**Example 1**
+Task: "Translate 'good night' into French."
+Assistant Reply: "Bonne nuit."
+→
+{{
+"score": 1.0,
+"reason": "The assistant provided the exact, correct translation requested by the user."
+}}
+
+**Example 2**
+Task: "List three renewable energy sources."
+Assistant Reply: "Solar and wind energy."
+→
+{{
+"score": 0.5,
+"reason": "The assistant only listed two sources instead of three, so the goal was partially achieved."
+}}
+
+**Example 3**
+Task: "Summarize this paragraph."
+Assistant Reply: "It talks about technology."
+→
+{{
+"score": 0.25,
+"reason": "The summary is too vague and fails to convey key information from the text."
+}}
+
+*** END OF EXAMPLES ***
+
+USER TASK:
+{task}
+
+AGENT STEPS:
+{steps_taken}
+
+JSON:
+"""
+        )
+
+    @staticmethod
+    def get_plan_evaluation_score(task, steps_taken):
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.
+
+OBJECTIVE:
+
+Evaluate:
+
+1. **Plan Quality** — Was the agent's plan clear, complete, and logically structured to fully address the user's task?
+2. **Plan Adherence** — Did the agent consistently follow that plan without unjustified deviations, omissions, or extraneous steps?
+
+Your judgment must be strict: a plan must be well-formed and execution must align with it for a high score.
+
+EVALUATION CRITERIA
+
+- Plan Quality:
+- The plan should explicitly or implicitly outline all necessary steps to fulfill the user's task.
+- It must be logically ordered, neither vague nor overly generic.
+- Missing critical components or unclear structuring lowers the score drastically.
+
+- Plan Adherence:
+- Execution must closely match the planned steps.
+- Any skipped, added, or rearranged steps without clear justification count as plan deviations.
+- Minor, justified variations are acceptable but reduce the score slightly.
+
+- General Rules:
+- If no discernible plan exists, score ≤ 0.5 regardless of task completion.
+- Tool use should be coherent within the plan, not ad hoc or speculative.
+- This evaluation excludes correctness or efficiency — focus solely on plan and adherence.
+
+SCORING GUIDE:
+
+- **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
+- **0.75** → Mostly clear plan with minor omissions or small execution deviations that do not impact the overall strategy.
+- **0.5** → Partial plan exists but is incomplete, vague, or only partially followed; notable deviations present.
+- **0.25** → Weak or fragmented plan; execution frequently diverges or lacks coherence with any strategy.
+- **0.0** → No evidence of a plan; execution appears random or unrelated to the user's task.
+
+INSTRUCTIONS:
+
+1. Identify the agent's plan from the steps taken (explicit plans stated or implicit structure).
+2. Assess plan completeness and logical order relative to the user's task.
+3. Compare execution steps against the plan to check for adherence, noting any unjustified deviations.
+4. Deduct points for vagueness, missing critical steps, or inconsistent execution.
+
+OUTPUT FORMAT:
+
+Return only a valid JSON object with exactly two fields:
+
+{{
+"score": 0.0,
+"reason": "1-3 concise sentences explaining the quality of the plan and how well execution matched it. Specify missing or extra steps, plan clarity, and adherence issues."
+}}
+
+EXAMPLE:
+
+User Task: "Plan a business trip including booking a flight, hotel, and preparing an agenda."
+
+Agent Steps include:
+- Outlined flight, hotel, and agenda steps explicitly.
+- Executed flight and hotel booking steps.
+- Skipped agenda preparation despite mentioning it in the plan.
+
+Example JSON:
+
+{{
+"score": 0.75,
+"reason": "The agent formed a clear plan covering flights, hotel, and agenda, but failed to execute the agenda preparation step, reducing adherence."
+}}
+
+**** END OF EXAMPLE ****
+
+INPUTS:
+
+USER TASK:
+{task}
+
+AGENT STEPS:
+{steps_taken}
+
+JSON:
+"""
+        )
+
+    @staticmethod
+    def get_final_reason(
+        final_score, threshold, goal_evaluations, plan_evalautions
+    ):
+        return textwrap.dedent(
+            f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
+
+You are given:
+- An agent's goal execution scores and reasons.
+- The agent's plan evaluation scores and reasons.
+- The **final combined score**.
+- The **threshold** required to pass.
+- Whether the result is a **pass** or **fail**.
+
+Your job is to write a short, precise explanation of **why** the agent passed or failed — taking into account the quality of execution and planning, and the threshold.
+
+---
+
+INSTRUCTIONS:
+
+- Write 2-4 clear, objective sentences explaining the overall result.
+- Explicitly reference both the task and plan performance — **both must be addressed**.
+- Mention how the final score compares to the threshold.
+- If the agent **passed**, highlight how both task execution and planning were sufficient to meet the goal.
+- If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
+- Avoid vague praise or criticism — ground the reason in the actual scores and justifications.
+
+---
+
+FORMAT:
+Return only a single string. Do **not** include JSON or any extra formatting.
+
+---
+
+Goal evaluations:
+{goal_evaluations}
+
+Plan evaluations:
+{plan_evalautions}
+
+Final Score: {final_score}
+Threshold: {threshold}
+Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+Final Reason:
+"""
+        )
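Because the templates above are plain static methods that return prompt strings, they can be rendered and inspected without running the metric. A small sketch follows; the task and steps strings are invented for illustration.

```python
# Sketch: render the new goal-accuracy prompts directly to see what the judge model receives.
from deepeval.metrics.goal_accuracy.template import GoalAccuracyTemplate

task = "User messages: \nList three renewable energy sources."
steps = "Assistant messages: \nSolar and wind energy."

accuracy_prompt = GoalAccuracyTemplate.get_accuracy_score(task, steps)
plan_prompt = GoalAccuracyTemplate.get_plan_evaluation_score(task, steps)

# Both prompts end with "JSON:" and expect a {"score": ..., "reason": ...} object,
# matching the GoalScore/PlanScore pydantic models from schema.py.
print(accuracy_prompt)
print(plan_prompt)
```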
--- deepeval/metrics/hallucination/hallucination.py
+++ deepeval/metrics/hallucination/hallucination.py
@@ -16,15 +16,16 @@ from deepeval.metrics.hallucination.template import HallucinationTemplate
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.hallucination.schema import *
-
-required_params: List[LLMTestCaseParams] = [
-    LLMTestCaseParams.INPUT,
-    LLMTestCaseParams.ACTUAL_OUTPUT,
-    LLMTestCaseParams.CONTEXT,
-]
+from deepeval.metrics.api import metric_data_manager
 
 
 class HallucinationMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.CONTEXT,
+    ]
+
     def __init__(
         self,
         threshold: float = 0.5,
@@ -51,9 +52,10 @@ class HallucinationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, required_params, self)
+        check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -66,6 +68,7 @@ class HallucinationMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -84,6 +87,10 @@ class HallucinationMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
 
             return self.score
 
@@ -92,9 +99,10 @@ class HallucinationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, required_params, self)
+        check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -118,7 +126,10 @@ class HallucinationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self):