deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
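The headline additions in 3.6.8 are new multi-turn agent metrics (goal accuracy, plan adherence, plan quality, step efficiency, tool use, topic adherence) plus a deepeval/metrics/api.py module whose metric_data_manager posts metric results to Confident AI; many existing metrics also gain a _log_metric_to_confident flag. The hunks below show the new goal_accuracy module and the representative changes to the hallucination metric. As a rough usage sketch of the new GoalAccuracyMetric (assuming it is re-exported from deepeval.metrics, as the __init__.py change suggests; the conversation below is illustrative, not taken from the diff):

    from deepeval.metrics import GoalAccuracyMetric
    from deepeval.test_case import ConversationalTestCase, Turn

    # Only turn role and content are required by this metric
    # (see _required_test_case_params in the hunk below).
    test_case = ConversationalTestCase(
        turns=[
            Turn(role="user", content="Translate 'good night' into French."),
            Turn(role="assistant", content="Bonne nuit."),
        ]
    )

    metric = GoalAccuracyMetric(threshold=0.5, verbose_mode=True)
    metric.measure(test_case)  # delegates to a_measure when async_mode=True (the default)
    print(metric.score, metric.reason)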
deepeval/metrics/goal_accuracy/goal_accuracy.py (new file)
@@ -0,0 +1,349 @@
+ from typing import Optional, List, Union
+ import asyncio
+ from deepeval.utils import get_or_create_event_loop, prettify_list
+ from deepeval.metrics.utils import (
+     construct_verbose_logs,
+     trimAndLoadJson,
+     get_unit_interactions,
+     print_tools_called,
+     check_conversational_test_case_params,
+     initialize_model,
+ )
+ from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+ from deepeval.metrics import BaseConversationalMetric
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.goal_accuracy.template import (
+     GoalAccuracyTemplate,
+ )
+ from deepeval.metrics.goal_accuracy.schema import (
+     GoalSteps,
+     GoalScore,
+     PlanScore,
+ )
+ from deepeval.metrics.api import metric_data_manager
+
+
+ class GoalAccuracyMetric(BaseConversationalMetric):
+
+     _required_test_case_params = [
+         TurnParams.ROLE,
+         TurnParams.CONTENT,
+     ]
+
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = initialize_model(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.include_reason = include_reason
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+
+     def measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         check_conversational_test_case_params(
+             test_case, self._required_test_case_params, self
+         )
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_measure(
+                         test_case,
+                         _show_indicator=False,
+                         _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
+                     )
+                 )
+             else:
+                 unit_interactions = get_unit_interactions(test_case.turns)
+                 goal_and_steps_taken = self._goal_and_steps_taken(
+                     unit_interactions
+                 )
+                 goal_scores = [
+                     self._get_goal_accuracy_score(
+                         task.user_goal, task.steps_taken
+                     )
+                     for task in goal_and_steps_taken
+                 ]
+                 plan_scores = [
+                     self._get_plan_scores(task.user_goal, task.steps_taken)
+                     for task in goal_and_steps_taken
+                 ]
+                 self.score = self._calculate_score(goal_scores, plan_scores)
+                 self.success = self.score >= self.threshold
+                 self.reason = self._generate_reason(goal_scores, plan_scores)
+
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         f"Goals and steps taken: \n{self.print_goals_and_steps_taken(goal_and_steps_taken)} \n",
+                         f"Goal evaluations: {prettify_list(goal_scores)} \n\n"
+                         f"Plan evaluations: {prettify_list(plan_scores)} \n\n"
+                         f"Final Score: {self.score}",
+                         f"Final Reason: {self.reason}",
+                     ],
+                 )
+
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )
+
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         check_conversational_test_case_params(
+             test_case, self._required_test_case_params, self
+         )
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+
+         with metric_progress_indicator(
+             self,
+             async_mode=True,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         ):
+             unit_interactions = get_unit_interactions(test_case.turns)
+             goal_and_steps_taken = self._goal_and_steps_taken(unit_interactions)
+             goal_scores = await asyncio.gather(
+                 *[
+                     self._a_get_goal_accuracy_score(
+                         task.user_goal, task.steps_taken
+                     )
+                     for task in goal_and_steps_taken
+                 ]
+             )
+             plan_scores = await asyncio.gather(
+                 *[
+                     self._a_get_plan_scores(task.user_goal, task.steps_taken)
+                     for task in goal_and_steps_taken
+                 ]
+             )
+             self.score = self._calculate_score(goal_scores, plan_scores)
+             self.success = self.score >= self.threshold
+             self.reason = await self._a_generate_reason(
+                 goal_scores, plan_scores
+             )
+
+             self.verbose_logs = construct_verbose_logs(
+                 self,
+                 steps=[
+                     f"Goals and steps taken: \n{self.print_goals_and_steps_taken(goal_and_steps_taken)} \n",
+                     f"Goal evaluations: {prettify_list(goal_scores)} \n\n"
+                     f"Plan evaluations: {prettify_list(plan_scores)} \n\n"
+                     f"Final Score: {self.score}",
+                     f"Final Reason: {self.reason}",
+                 ],
+             )
+
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
+
+             return self.score
+
+     def _goal_and_steps_taken(
+         self, unit_interactions: List[List[Turn]]
+     ) -> List[GoalSteps]:
+         goal_and_steps_taken = []
+         for unit_interaction in unit_interactions:
+             user_messages = "User messages: \n"
+             for turn in unit_interaction:
+                 if turn.role == "user":
+                     user_messages += turn.content + "\n"
+                 else:
+                     break
+             new_goal_steps = GoalSteps(user_goal=user_messages, steps_taken=[])
+             assistant_messages = "Assistant messages: \n"
+             for turn in unit_interaction[1:]:
+                 if turn.role == "assistant":
+                     assistant_messages += f"{turn.content} \n"
+                     if turn.tools_called:
+                         assistant_messages += f"Tools called: \n{print_tools_called(turn.tools_called)} \n"
+             new_goal_steps.steps_taken.append(assistant_messages)
+             goal_and_steps_taken.append(new_goal_steps)
+         return goal_and_steps_taken
+
+     def _get_plan_scores(self, user_goal, steps_taken):
+         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
+             user_goal, "\n".join(steps_taken)
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=PlanScore)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: PlanScore = self.model.generate(prompt, schema=PlanScore)
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return PlanScore(**data)
+
+     async def _a_get_plan_scores(self, user_goal, steps_taken):
+         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
+             user_goal, "\n".join(steps_taken)
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt, schema=PlanScore)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: PlanScore = await self.model.a_generate(
+                     prompt, schema=PlanScore
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return PlanScore(**data)
+
+     def _calculate_score(
+         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+     ):
+         goal_scores = [goal_score.score for goal_score in goal_scores]
+         plan_scores = [plan_score.score for plan_score in plan_scores]
+         goal_score_divisor = len(goal_scores) if len(goal_scores) > 0 else 1
+         plan_score_divisor = len(plan_scores) if len(plan_scores) > 0 else 1
+         goal_avg = sum(goal_scores) / goal_score_divisor
+         plan_avg = sum(plan_scores) / plan_score_divisor
+         score = (goal_avg + plan_avg) / 2
+         return 0 if self.strict_mode and score < self.threshold else score
+
+     def _generate_reason(
+         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+     ):
+         goal_evaluations = ""
+         for goal_score in goal_scores:
+             goal_evaluations += (
+                 f"Score: {goal_score.score}, Reason: {goal_score.reason}"
+             )
+         plan_evalautions = ""
+         for plan_score in plan_scores:
+             plan_evalautions += (
+                 f"Score: {plan_score.score}, Reason: {plan_score.reason} \n"
+             )
+
+         prompt = GoalAccuracyTemplate.get_final_reason(
+             self.score, self.threshold, goal_evaluations, plan_evalautions
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = self.model.generate(prompt)
+             return res
+
+     async def _a_generate_reason(
+         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+     ):
+         goal_evaluations = ""
+         for goal_score in goal_scores:
+             goal_evaluations += (
+                 f"Score: {goal_score.score}, Reason: {goal_score.reason}"
+             )
+         plan_evalautions = ""
+         for plan_score in plan_scores:
+             plan_evalautions += (
+                 f"Score: {plan_score.score}, Reason: {plan_score.reason} \n"
+             )
+
+         prompt = GoalAccuracyTemplate.get_final_reason(
+             self.score, self.threshold, goal_evaluations, plan_evalautions
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = await self.model.a_generate(prompt)
+             return res
+
+     def _get_goal_accuracy_score(self, user_goal, steps_taken):
+         prompt = GoalAccuracyTemplate.get_accuracy_score(
+             user_goal, "\n".join(steps_taken)
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=GoalScore)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: GoalScore = self.model.generate(prompt, schema=GoalScore)
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return GoalScore(**data)
+
+     async def _a_get_goal_accuracy_score(self, user_goal, steps_taken):
+         prompt = GoalAccuracyTemplate.get_accuracy_score(
+             user_goal, "\n".join(steps_taken)
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt, schema=GoalScore)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: GoalScore = await self.model.a_generate(
+                     prompt, schema=GoalScore
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return GoalScore(**data)
+
+     def print_goals_and_steps_taken(self, goals_and_steps):
+         final_goals_and_steps = ""
+         for goal_step in goals_and_steps:
+             final_goals_and_steps += f"{goal_step.user_goal} \n"
+             final_goals_and_steps += (
+                 f"c{prettify_list(goal_step.steps_taken)} \n\n"
+             )
+         return final_goals_and_steps
+
+     def is_successful(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Goal Accuracy"
deepeval/metrics/goal_accuracy/schema.py (new file)
@@ -0,0 +1,17 @@
+ from pydantic import BaseModel
+ from typing import List
+
+
+ class GoalSteps(BaseModel):
+     user_goal: str
+     steps_taken: List[str]
+
+
+ class GoalScore(BaseModel):
+     score: float
+     reason: str
+
+
+ class PlanScore(BaseModel):
+     score: float
+     reason: str
deepeval/metrics/goal_accuracy/template.py (new file)
@@ -0,0 +1,235 @@
+ from typing import List
+ import textwrap
+
+
+ class GoalAccuracyTemplate:
+     @staticmethod
+     def get_accuracy_score(task, steps_taken):
+         return textwrap.dedent(
+             f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.
+
+             PURPOSE:
+
+             Evaluate whether the assistant's **visible output** (what the user actually saw) **fully and correctly achieved the user's stated goal.
+             Ignore internal reasoning, hidden tool calls, or retriever outputs unless their results were explicitly surfaced to the user.
+
+             The evaluation must be **strict and adversarial** — if the goal is not *clearly, fully, and correctly achieved*, assign a low score.
+
+             EVALUATION RULES
+
+             1. **User-visible fulfillment only**
+                - Base your judgment solely on what the user would see in the assistant's message.
+                - Ignore hidden or internal steps unless their results were explicitly communicated.
+
+             2. **Goal completion**
+                - The assistant must explicitly provide everything the user asked for.
+                - If even one subpart of the task is missing, incomplete, or vague, the score must be **≤ 0.5**.
+
+             3. **Correctness and relevance**
+                - The information provided must be factually correct and directly relevant to the task.
+                - Hallucinated or unrelated content automatically lowers the score.
+
+             4. **Self-sufficiency**
+                - The visible response must stand on its own; the user should not need prior context or follow-up clarification.
+
+             5. **Strict bias toward failure**
+                - When uncertain, assume the goal was **not achieved**.
+                - The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.
+
+             SCORING GUIDE:
+
+             - **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
+             - **0.75** → Mostly achieved; minor omissions or trivial inaccuracies.
+             - **0.5** → Partially achieved; core goal addressed, but key parts missing or incorrect.
+             - **0.25** → Weak attempt; loosely related but fails to satisfy the user’s request.
+             - **0.0** → Goal not achieved at all; irrelevant, wrong, or missing answer.
+
+             *When in doubt, choose the lower score.*
+
+             OUTPUT FORMAT:
+
+             Return only a valid JSON object with this structure:
+
+             {{
+                 "score": 0.0,
+                 "reason": "1-3 factual sentences explaining what parts of the user's goal were or were not achieved."
+             }}
+
+             The reason must:
+             - Be objective and concise.
+             - Refer to **specific missing or incorrect elements**.
+             - Avoid vague language (“somewhat correct”, “pretty accurate”).
+
+             EXAMPLES:
+
+             **Example 1**
+             Task: "Translate 'good night' into French."
+             Assistant Reply: "Bonne nuit."
+
+             {{
+                 "score": 1.0,
+                 "reason": "The assistant provided the exact, correct translation requested by the user."
+             }}
+
+             **Example 2**
+             Task: "List three renewable energy sources."
+             Assistant Reply: "Solar and wind energy."
+
+             {{
+                 "score": 0.5,
+                 "reason": "The assistant only listed two sources instead of three, so the goal was partially achieved."
+             }}
+
+             **Example 3**
+             Task: "Summarize this paragraph."
+             Assistant Reply: "It talks about technology."
+
+             {{
+                 "score": 0.25,
+                 "reason": "The summary is too vague and fails to convey key information from the text."
+             }}
+
+             *** END OF EXAMPLES ***
+
+             USER TASK:
+             {task}
+
+             AGENT STEPS:
+             {steps_taken}
+
+             JSON:
+             """
+         )
+
+     @staticmethod
+     def get_plan_evaluation_score(task, steps_taken):
+         return textwrap.dedent(
+             f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.
+
+             OBJECTIVE:
+
+             Evaluate:
+
+             1. **Plan Quality** — Was the agent's plan clear, complete, and logically structured to fully address the user's task?
+             2. **Plan Adherence** — Did the agent consistently follow that plan without unjustified deviations, omissions, or extraneous steps?
+
+             Your judgment must be strict: a plan must be well-formed and execution must align with it for a high score.
+
+             EVALUATION CRITERIA
+
+             - Plan Quality:
+               - The plan should explicitly or implicitly outline all necessary steps to fulfill the user's task.
+               - It must be logically ordered, neither vague nor overly generic.
+               - Missing critical components or unclear structuring lowers the score drastically.
+
+             - Plan Adherence:
+               - Execution must closely match the planned steps.
+               - Any skipped, added, or rearranged steps without clear justification count as plan deviations.
+               - Minor, justified variations are acceptable but reduce the score slightly.
+
+             - General Rules:
+               - If no discernible plan exists, score ≤ 0.5 regardless of task completion.
+               - Tool use should be coherent within the plan, not ad hoc or speculative.
+               - This evaluation excludes correctness or efficiency — focus solely on plan and adherence.
+
+             SCORING GUIDE:
+
+             - **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
+             - **0.75** → Mostly clear plan with minor omissions or small execution deviations that do not impact the overall strategy.
+             - **0.5** → Partial plan exists but is incomplete, vague, or only partially followed; notable deviations present.
+             - **0.25** → Weak or fragmented plan; execution frequently diverges or lacks coherence with any strategy.
+             - **0.0** → No evidence of a plan; execution appears random or unrelated to the user's task.
+
+             INSTRUCTIONS:
+
+             1. Identify the agent's plan from the steps taken (explicit plans stated or implicit structure).
+             2. Assess plan completeness and logical order relative to the user's task.
+             3. Compare execution steps against the plan to check for adherence, noting any unjustified deviations.
+             4. Deduct points for vagueness, missing critical steps, or inconsistent execution.
+
+             OUTPUT FORMAT:
+
+             Return only a valid JSON object with exactly two fields:
+
+             {{
+                 "score": 0.0,
+                 "reason": "1-3 concise sentences explaining the quality of the plan and how well execution matched it. Specify missing or extra steps, plan clarity, and adherence issues."
+             }}
+
+             EXAMPLE:
+
+             User Task: "Plan a business trip including booking a flight, hotel, and preparing an agenda."
+
+             Agent Steps include:
+             - Outlined flight, hotel, and agenda steps explicitly.
+             - Executed flight and hotel booking steps.
+             - Skipped agenda preparation despite mentioning it in the plan.
+
+             Example JSON:
+
+             {{
+                 "score": 0.75,
+                 "reason": "The agent formed a clear plan covering flights, hotel, and agenda, but failed to execute the agenda preparation step, reducing adherence."
+             }}
+
+             **** END OF EXAMPLE ****
+
+             INPUTS:
+
+             USER TASK:
+             {task}
+
+             AGENT STEPS:
+             {steps_taken}
+
+             JSON:
+             """
+         )
+
+     @staticmethod
+     def get_final_reason(
+         final_score, threshold, goal_evaluations, plan_evalautions
+     ):
+         return textwrap.dedent(
+             f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
+
+             You are given:
+             - An agent's goal execution scores and reasons.
+             - The agent's plan evaluation scores and reasons.
+             - The **final combined score**.
+             - The **threshold** required to pass.
+             - Whether the result is a **pass** or **fail**.
+
+             Your job is to write a short, precise explanation of **why** the agent passed or failed — taking into account the quality of execution and planning, and the threshold.
+
+             ---
+
+             INSTRUCTIONS:
+
+             - Write 2-4 clear, objective sentences explaining the overall result.
+             - Explicitly reference both the task and plan performance — **both must be addressed**.
+             - Mention how the final score compares to the threshold.
+             - If the agent **passed**, highlight how both task execution and planning were sufficient to meet the goal.
+             - If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
+             - Avoid vague praise or criticism — ground the reason in the actual scores and justifications.
+
+             ---
+
+             FORMAT:
+             Return only a single string. Do **not** include JSON or any extra formatting.
+
+             ---
+
+             Goal evaluations:
+             {goal_evaluations}
+
+             Plan evaluations:
+             {plan_evalautions}
+
+             Final Score: {final_score}
+             Threshold: {threshold}
+             Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+             Final Reason:
+             """
+         )
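All three templates above demand bare JSON (or, for get_final_reason, a plain string). For custom models whose generate(prompt, schema=...) raises TypeError, the metric falls back to raw generation plus trimAndLoadJson, so a custom DeepEvalBaseLLM only needs to return output the fallback can parse into the schema classes. A minimal sketch of that end state (values illustrative; the metric itself uses trimAndLoadJson rather than json.loads):

    import json
    from deepeval.metrics.goal_accuracy.schema import GoalScore

    raw = '{"score": 0.5, "reason": "The assistant listed two sources instead of three."}'
    parsed = GoalScore(**json.loads(raw))  # same shape the metric builds from the raw model output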
deepeval/metrics/hallucination/hallucination.py
@@ -16,15 +16,16 @@ from deepeval.metrics.hallucination.template import HallucinationTemplate
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
  from deepeval.metrics.hallucination.schema import *
-
- required_params: List[LLMTestCaseParams] = [
-     LLMTestCaseParams.INPUT,
-     LLMTestCaseParams.ACTUAL_OUTPUT,
-     LLMTestCaseParams.CONTEXT,
- ]
+ from deepeval.metrics.api import metric_data_manager


  class HallucinationMetric(BaseMetric):
+     _required_params: List[LLMTestCaseParams] = [
+         LLMTestCaseParams.INPUT,
+         LLMTestCaseParams.ACTUAL_OUTPUT,
+         LLMTestCaseParams.CONTEXT,
+     ]
+
      def __init__(
          self,
          threshold: float = 0.5,
@@ -51,9 +52,10 @@ class HallucinationMetric(BaseMetric):
          test_case: LLMTestCase,
          _show_indicator: bool = True,
          _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
      ) -> float:

-         check_llm_test_case_params(test_case, required_params, self)
+         check_llm_test_case_params(test_case, self._required_params, self)

          self.evaluation_cost = 0 if self.using_native_model else None
          with metric_progress_indicator(
@@ -66,6 +68,7 @@ class HallucinationMetric(BaseMetric):
                      test_case,
                      _show_indicator=False,
                      _in_component=_in_component,
+                     _log_metric_to_confident=_log_metric_to_confident,
                  )
              )
          else:
@@ -84,6 +87,10 @@ class HallucinationMetric(BaseMetric):
                      f"Score: {self.score}\nReason: {self.reason}",
                  ],
              )
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )

          return self.score

@@ -92,9 +99,10 @@ class HallucinationMetric(BaseMetric):
          test_case: LLMTestCase,
          _show_indicator: bool = True,
          _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
      ) -> float:

-         check_llm_test_case_params(test_case, required_params, self)
+         check_llm_test_case_params(test_case, self._required_params, self)

          self.evaluation_cost = 0 if self.using_native_model else None
          with metric_progress_indicator(
@@ -118,7 +126,10 @@ class HallucinationMetric(BaseMetric):
                      f"Score: {self.score}\nReason: {self.reason}",
                  ],
              )
-
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
          return self.score

      async def _a_generate_reason(self):
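The hallucination diff above is representative of a pattern applied across the metrics in this release: the module-level required_params list moves onto the class as _required_params, and measure()/a_measure() gain a _log_metric_to_confident flag that reports results through metric_data_manager. A sketch of opting out of that reporting (the test case values are illustrative):

    from deepeval.metrics import HallucinationMetric
    from deepeval.test_case import LLMTestCase

    metric = HallucinationMetric(threshold=0.5)
    metric.measure(
        LLMTestCase(
            input="Summarize the context.",
            actual_output="The context says solar power converts sunlight into electricity.",
            context=["Solar power converts sunlight into electricity."],
        ),
        _log_metric_to_confident=False,  # new in 3.6.8; skips metric_data_manager.post_metric_if_enabled
    )
    print(metric.score)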