deepeval-3.6.6-py3-none-any.whl → deepeval-3.6.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/plan_quality/plan_quality.py (new file)
@@ -0,0 +1,292 @@
+from typing import Optional, List, Union, Dict
+
+from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_llm_test_case_params,
+    initialize_model,
+)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.metrics import BaseMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.step_efficiency.template import (
+    StepEfficiencyTemplate,
+)
+from deepeval.metrics.step_efficiency.schema import Task
+from deepeval.metrics.plan_quality.schema import (
+    AgentPlan,
+    PlanQualityScore,
+)
+from deepeval.metrics.plan_quality.template import (
+    PlanQualityTemplate,
+)
+from deepeval.metrics.plan_adherence.template import (
+    PlanAdherenceTemplate,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class PlanQualityMetric(BaseMetric):
+
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
+        LLMTestCaseParams.TOOLS_CALLED,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.requires_trace = True
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        if not has_trace:
+            check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                task = self._extract_task_from_trace(test_case)
+                agent_plan = self._extract_plan_from_trace(test_case)
+                if len(agent_plan.plan) == 0:
+                    self.score = 1
+                    self.reason = "There were no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in any one of the trace attributes."
+                else:
+                    plan_quality_score = self._get_plan_quality_score(
+                        task, agent_plan.plan
+                    )
+                    self.score = (
+                        0
+                        if self.strict_mode
+                        and plan_quality_score.score < self.threshold
+                        else plan_quality_score.score
+                    )
+                    self.reason = plan_quality_score.reason
+                self.success = self.score >= self.threshold
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Task: {task} \n",
+                        f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                        f"Final Score Score: {self.score} \n",
+                        f"Final Score Reason: {self.reason} \n",
+                    ],
+                )
+
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        if not has_trace:
+            check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            task = await self._a_extract_task_from_trace(test_case)
+            agent_plan = await self._a_extract_plan_from_trace(test_case)
+            if len(agent_plan.plan) == 0:
+                self.score = 1
+                self.reason = "There are no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in the trace attributes."
+            else:
+                plan_quality_score = await self._a_get_plan_quality_score(
+                    task, agent_plan.plan
+                )
+                self.score = (
+                    0
+                    if self.strict_mode
+                    and plan_quality_score.score < self.threshold
+                    else plan_quality_score.score
+                )
+                self.reason = plan_quality_score.reason
+            self.success = self.score >= self.threshold
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"Task: {task} \n",
+                    f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                    f"Final Score: {self.score} \n",
+                    f"Final Reason: {self.reason} \n",
+                ],
+            )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    def _get_plan_quality_score(self, task, plan):
+        prompt = PlanQualityTemplate.evaluate_plan_quality(
+            task, "\n".join(plan)
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=PlanQualityScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=PlanQualityScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanQualityScore(**data)
+
+    async def _a_get_plan_quality_score(self, task, plan):
+        prompt = PlanQualityTemplate.evaluate_plan_quality(
+            task, "\n".join(plan)
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=PlanQualityScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = await self.model.a_generate(
+                    prompt, schema=PlanQualityScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return PlanQualityScore(**data)
+
+    def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
+        prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=AgentPlan)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=AgentPlan)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return AgentPlan(**data)
+
+    async def _a_extract_plan_from_trace(
+        self, test_case: LLMTestCase
+    ) -> AgentPlan:
+        prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=AgentPlan)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res: Task = await self.model.a_generate(
+                    prompt, schema=AgentPlan
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return AgentPlan(**data)
+
+    def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+        prompt = StepEfficiencyTemplate.extract_task_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Task)
+            self.evaluation_cost += cost
+            return res.task
+        else:
+            try:
+                res: Task = self.model.generate(prompt, schema=Task)
+                return res.task
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["task"]
+
+    async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+        prompt = StepEfficiencyTemplate.extract_task_from_trace(
+            test_case._trace_dict
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Task)
+            self.evaluation_cost += cost
+            return res.task
+        else:
+            try:
+                res: Task = await self.model.a_generate(prompt, schema=Task)
+                return res.task
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["task"]
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Plan Quality"
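For orientation, here is a minimal usage sketch of the new PlanQualityMetric added above. The class and its constructor arguments come straight from the diff; the model name, test case values, and ToolCall contents are illustrative assumptions, not part of the package.

    # Hedged sketch: illustrative values only, not taken from the deepeval source.
    from deepeval.test_case import LLMTestCase, ToolCall
    from deepeval.metrics.plan_quality.plan_quality import PlanQualityMetric

    metric = PlanQualityMetric(
        threshold=0.5,   # strict_mode=True would force the threshold to 1
        model="gpt-4o",  # assumption; any str or DeepEvalBaseLLM accepted by initialize_model
        verbose_mode=True,
    )

    test_case = LLMTestCase(
        input="Plan and book a 3-day trip to Tokyo under $2,000.",
        actual_output="Booked flights and a hotel within budget.",
        # INPUT, ACTUAL_OUTPUT, and TOOLS_CALLED are required when no trace is attached.
        tools_called=[ToolCall(name="search_flights")],
    )

    metric.measure(test_case)
    print(metric.score, metric.reason)

When a trace is attached (test_case._trace_dict), the metric instead extracts the task and plan from the trace and scores the plan via PlanQualityTemplate; if no plan is found it scores 1 and explains why in the reason.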
deepeval/metrics/plan_quality/schema.py (new file)
@@ -0,0 +1,11 @@
+from pydantic import BaseModel
+from typing import List, Dict, Literal
+
+
+class AgentPlan(BaseModel):
+    plan: List[str]
+
+
+class PlanQualityScore(BaseModel):
+    score: float
+    reason: str
deepeval/metrics/plan_quality/template.py (new file)
@@ -0,0 +1,101 @@
+import textwrap
+import json
+from deepeval.tracing.utils import make_json_serializable
+
+
+class PlanQualityTemplate:
+
+    @staticmethod
+    def evaluate_plan_quality(user_task: str, agent_plan: list) -> str:
+        return textwrap.dedent(
+            f"""You are a **plan quality evaluator**. Your task is to critically assess the **quality, completeness, and optimality** of an AI agent's plan to accomplish the given user task.
+
+            INPUTS:
+
+            - **User Task:** The user's explicit goal or instruction.
+            - **Agent Plan:** The ordered list of steps the agent intends to follow to achieve that goal.
+
+            EVALUATION OBJECTIVE:
+
+            Judge the **intrinsic quality** of the plan — whether the plan itself is strong enough to fully and efficiently achieve the user's task.
+
+            The evaluation must be **strict**. If the plan is incomplete, inefficient, redundant, or missing critical details, assign a very low score.
+
+            STRICT EVALUATION CRITERIA:
+
+            1. Completeness (Most Important)
+                - The plan must fully address all major requirements of the user task.
+                - Missing even one critical subtask or dependency should reduce the score sharply.
+                - The plan must include all prerequisite actions necessary for the final outcome.
+
+            2. Logical Coherence
+                - Steps must follow a clear, rational sequence that leads directly to completing the task.
+                - Disordered, redundant, or circular reasoning should be penalized heavily.
+                - Every step must have a clear purpose; no filler or irrelevant actions.
+
+            3. Optimality and Efficiency
+                - The plan must be **minimal but sufficient** — no unnecessary or repetitive steps.
+                - If a more direct, simpler, or logically superior plan could achieve the same outcome, the current plan should receive a lower score.
+
+            4. Level of Detail
+                - Each step should be specific enough for an agent to execute it reliably without ambiguity.
+                - Vague steps (e.g., “Do research”, “Handle results”) that lack operational clarity
+                  lower the score.
+
+            5. Alignment with Task
+                - The plan must explicitly and directly target the user's stated goal.
+                - If any step diverges from the main objective, the score should drop significantly.
+
+            ---
+
+            SCORING SCALE (STRICT)
+
+            - **1.0 — Excellent plan**
+                - Fully complete, logically ordered, and optimally efficient.
+                - No missing, redundant, or ambiguous steps.
+                - Directly fulfills every aspect of the user task.
+
+            - **0.75 — Good plan**
+                - Covers nearly all aspects of the task with clear logic.
+                - Minor gaps or small inefficiencies that do not block task completion.
+
+            - **0.5 — Adequate but flawed plan**
+                - Partially complete; key details missing or step order inefficient.
+                - Some ambiguity or redundancy that would likely affect execution success.
+
+            - **0.25 — Weak plan**
+                - Major missing steps or unclear logic.
+                - The plan would likely fail to complete the task as written.
+
+            - **0.0 — Inadequate plan**
+                - Irrelevant, incoherent, or severely incomplete plan.
+                - Does not align with the user’s task or cannot plausibly achieve it.
+
+            *When in doubt, assign the lower score.*
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with this exact structure:
+
+            {{
+                "score": 0.0,
+                "reason": "1-3 short, precise sentences explaining what the plan lacks or how it could fail."
+            }}
+
+            The `"reason"` must:
+            - Reference specific missing, unclear, or inefficient steps.
+            - Avoid vague language (“seems fine”, “mostly works”).
+            - Use objective terms describing gaps or weaknesses.
+
+            PROVIDED DATA
+
+            User Task:
+            {user_task}
+
+            Agent Plan:
+            {agent_plan}
+
+
+            JSON:
+            """
+        )
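The template pins the judge to a two-field JSON object, which maps one-to-one onto the PlanQualityScore schema shown earlier. A small sketch of that round trip; the sample judge reply is made up, and the package itself parses such replies with trimAndLoadJson rather than plain json.loads:

    import json
    from deepeval.metrics.plan_quality.schema import PlanQualityScore

    # Illustrative judge reply following the template's required structure.
    raw = '{"score": 0.25, "reason": "The plan never verifies budget constraints before booking."}'
    verdict = PlanQualityScore(**json.loads(raw))
    print(verdict.score, verdict.reason)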
deepeval/metrics/prompt_alignment/prompt_alignment.py
@@ -20,6 +20,8 @@ from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
 from deepeval.config.settings import get_settings
 
+from deepeval.metrics.api import metric_data_manager
+
 
 class PromptAlignmentMetric(BaseMetric):
 
@@ -55,6 +57,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -93,6 +96,10 @@ class PromptAlignmentMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
 
             return self.score
 
@@ -101,6 +108,7 @@ class PromptAlignmentMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -128,7 +136,10 @@ class PromptAlignmentMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str, actual_output: str) -> str:
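This diff shows the recurring 3.6.8 pattern that the role adherence and role violation files below repeat: measure() and a_measure() gain a _log_metric_to_confident flag (default True) and, when it is set, report the result through metric_data_manager.post_metric_if_enabled. A hedged sketch of opting out for a single run; the flag is underscore-prefixed and internal, and the instruction list and test case values are illustrative:

    from deepeval.test_case import LLMTestCase
    from deepeval.metrics import PromptAlignmentMetric

    metric = PromptAlignmentMetric(prompt_instructions=["Reply in formal English."])
    test_case = LLMTestCase(
        input="yo, any update on my order?",
        actual_output="Good afternoon. Your order shipped this morning.",
    )

    # Score locally without posting this metric run to Confident AI.
    metric.measure(test_case, _log_metric_to_confident=False)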
deepeval/metrics/role_adherence/role_adherence.py
@@ -1,6 +1,7 @@
 from typing import Optional, Union, List
 
 from deepeval.metrics import BaseConversationalMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
 )
@@ -44,6 +45,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
             test_case,
@@ -63,6 +65,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -82,6 +85,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                        f"Score: {self.score}\nReason: {self.reason}",
                    ],
                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score
 
     async def a_measure(
@@ -89,6 +96,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case,
@@ -124,6 +132,10 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, role: str) -> str:
deepeval/metrics/role_violation/role_violation.py
@@ -17,6 +17,7 @@ from deepeval.metrics.utils import (
 )
 from deepeval.metrics.role_violation.template import RoleViolationTemplate
 from deepeval.metrics.role_violation.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class RoleViolationMetric(BaseMetric):
@@ -58,6 +59,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -73,6 +75,7 @@ class RoleViolationMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -94,6 +97,10 @@ class RoleViolationMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
 
             return self.score
 
@@ -102,6 +109,7 @@ class RoleViolationMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -131,6 +139,10 @@ class RoleViolationMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
deepeval/metrics/step_efficiency/__init__.py (new file)
@@ -0,0 +1 @@
+from .step_efficiency import StepEfficiencyMetric
deepeval/metrics/step_efficiency/schema.py (new file)
@@ -0,0 +1,11 @@
+from pydantic import BaseModel
+from typing import List, Dict, Literal
+
+
+class Task(BaseModel):
+    task: str
+
+
+class EfficiencyVerdict(BaseModel):
+    score: float
+    reason: str
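These step efficiency schemas are consumed the same way as the plan quality ones above: the metric first asks the evaluation model for schema-constrained output and, if the model wrapper rejects the schema keyword, falls back to parsing raw JSON. A standalone sketch of that fallback, with a hypothetical `judge` object standing in for a DeepEvalBaseLLM subclass (the package itself uses trimAndLoadJson rather than plain json.loads):

    import json
    from deepeval.metrics.step_efficiency.schema import Task

    def extract_task(judge, prompt: str) -> str:
        # `judge` is assumed to expose generate(prompt, schema=...) like deepeval's model wrappers.
        try:
            res: Task = judge.generate(prompt, schema=Task)
            return res.task
        except TypeError:
            # Custom models without schema support return plain text; parse the JSON reply instead.
            data = json.loads(judge.generate(prompt))
            return Task(**data).task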