deepeval 3.6.7__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. deepeval/_version.py +1 -1
  2. deepeval/errors.py +20 -2
  3. deepeval/evaluate/execute.py +725 -217
  4. deepeval/evaluate/types.py +1 -0
  5. deepeval/evaluate/utils.py +13 -3
  6. deepeval/integrations/crewai/__init__.py +2 -1
  7. deepeval/integrations/crewai/tool.py +71 -0
  8. deepeval/integrations/llama_index/__init__.py +0 -4
  9. deepeval/integrations/llama_index/handler.py +20 -21
  10. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  11. deepeval/metrics/__init__.py +13 -0
  12. deepeval/metrics/base_metric.py +1 -0
  13. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  14. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  15. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  16. deepeval/metrics/dag/schema.py +1 -1
  17. deepeval/metrics/dag/templates.py +2 -2
  18. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  19. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  20. deepeval/metrics/goal_accuracy/schema.py +17 -0
  21. deepeval/metrics/goal_accuracy/template.py +235 -0
  22. deepeval/metrics/hallucination/hallucination.py +8 -8
  23. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  24. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  25. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  26. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  27. deepeval/metrics/plan_adherence/__init__.py +1 -0
  28. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  29. deepeval/metrics/plan_adherence/schema.py +11 -0
  30. deepeval/metrics/plan_adherence/template.py +170 -0
  31. deepeval/metrics/plan_quality/__init__.py +1 -0
  32. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  33. deepeval/metrics/plan_quality/schema.py +11 -0
  34. deepeval/metrics/plan_quality/template.py +101 -0
  35. deepeval/metrics/step_efficiency/__init__.py +1 -0
  36. deepeval/metrics/step_efficiency/schema.py +11 -0
  37. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  38. deepeval/metrics/step_efficiency/template.py +256 -0
  39. deepeval/metrics/task_completion/task_completion.py +1 -0
  40. deepeval/metrics/tool_correctness/schema.py +6 -0
  41. deepeval/metrics/tool_correctness/template.py +88 -0
  42. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  43. deepeval/metrics/tool_use/__init__.py +1 -0
  44. deepeval/metrics/tool_use/schema.py +19 -0
  45. deepeval/metrics/tool_use/template.py +220 -0
  46. deepeval/metrics/tool_use/tool_use.py +458 -0
  47. deepeval/metrics/topic_adherence/__init__.py +1 -0
  48. deepeval/metrics/topic_adherence/schema.py +16 -0
  49. deepeval/metrics/topic_adherence/template.py +162 -0
  50. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  52. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  53. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  54. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  55. deepeval/openai/extractors.py +61 -16
  56. deepeval/openai/patch.py +8 -12
  57. deepeval/openai/types.py +1 -1
  58. deepeval/openai/utils.py +108 -1
  59. deepeval/prompt/prompt.py +1 -0
  60. deepeval/prompt/utils.py +43 -14
  61. deepeval/synthesizer/synthesizer.py +11 -10
  62. deepeval/test_case/llm_test_case.py +6 -2
  63. deepeval/test_run/test_run.py +190 -207
  64. deepeval/tracing/__init__.py +2 -1
  65. deepeval/tracing/otel/exporter.py +3 -4
  66. deepeval/tracing/otel/utils.py +23 -4
  67. deepeval/tracing/trace_context.py +53 -38
  68. deepeval/tracing/tracing.py +23 -0
  69. deepeval/tracing/types.py +16 -14
  70. deepeval/utils.py +21 -0
  71. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  72. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/RECORD +75 -53
  73. deepeval/integrations/llama_index/agent/patched.py +0 -68
  74. deepeval/tracing/message_types/__init__.py +0 -10
  75. deepeval/tracing/message_types/base.py +0 -6
  76. deepeval/tracing/message_types/messages.py +0 -14
  77. deepeval/tracing/message_types/tools.py +0 -18
  78. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  79. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  80. {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/plan_adherence/plan_adherence.py
@@ -0,0 +1,292 @@
+ from typing import Optional, List, Union, Dict
+
+ from deepeval.utils import get_or_create_event_loop, prettify_list
+ from deepeval.metrics.utils import (
+     construct_verbose_logs,
+     trimAndLoadJson,
+     check_llm_test_case_params,
+     initialize_model,
+ )
+ from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+ from deepeval.metrics import BaseMetric
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.step_efficiency.template import (
+     StepEfficiencyTemplate,
+ )
+ from deepeval.metrics.step_efficiency.schema import Task
+ from deepeval.metrics.plan_adherence.schema import (
+     AgentPlan,
+     PlanAdherenceScore,
+ )
+ from deepeval.metrics.plan_adherence.template import (
+     PlanAdherenceTemplate,
+ )
+ from deepeval.metrics.api import metric_data_manager
+
+
+ class PlanAdherenceMetric(BaseMetric):
+
+     _required_params: List[LLMTestCaseParams] = [
+         LLMTestCaseParams.INPUT,
+         LLMTestCaseParams.ACTUAL_OUTPUT,
+         LLMTestCaseParams.TOOLS_CALLED,
+     ]
+
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = initialize_model(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.include_reason = include_reason
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+         self.requires_trace = True
+
+     def measure(
+         self,
+         test_case: LLMTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         has_trace: bool = isinstance(test_case._trace_dict, Dict)
+         if not has_trace:
+             check_llm_test_case_params(test_case, self._required_params, self)
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_measure(
+                         test_case,
+                         _show_indicator=False,
+                         _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
+                     )
+                 )
+             else:
+                 task = self._extract_task_from_trace(test_case)
+                 agent_plan = self._extract_plan_from_trace(test_case)
+                 if len(agent_plan.plan) == 0:
+                     self.score = 1
+                     self.reason = "There were no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in any one of the trace attributes."
+                 else:
+                     plan_adherence_score = self._get_plan_adherence_score(
+                         task, agent_plan.plan, test_case
+                     )
+                     self.score = (
+                         0
+                         if self.strict_mode
+                         and plan_adherence_score.score < self.threshold
+                         else plan_adherence_score.score
+                     )
+                     self.reason = plan_adherence_score.reason
+                 self.success = self.score >= self.threshold
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         f"Task: {task} \n",
+                         f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                         f"Final Score: {self.score} \n",
+                         f"Final Reason: {self.reason} \n",
+                     ],
+                 )
+
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )
+
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: LLMTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         has_trace: bool = isinstance(test_case._trace_dict, Dict)
+         if not has_trace:
+             check_llm_test_case_params(test_case, self._required_params, self)
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+
+         with metric_progress_indicator(
+             self,
+             async_mode=True,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         ):
+             task = await self._a_extract_task_from_trace(test_case)
+             agent_plan = await self._a_extract_plan_from_trace(test_case)
+             if len(agent_plan.plan) == 0:
+                 self.score = 1
+                 self.reason = "There were no plans to evaluate within the trace of your agent's execution. Please check if the agent's planning or reasoning or thinking is stored in any one of the trace attributes."
+             else:
+                 plan_adherence_score = await self._a_get_plan_adherence_score(
+                     task, agent_plan.plan, test_case
+                 )
+                 self.score = (
+                     0
+                     if self.strict_mode
+                     and plan_adherence_score.score < self.threshold
+                     else plan_adherence_score.score
+                 )
+                 self.reason = plan_adherence_score.reason
+             self.success = self.score >= self.threshold
+
+             self.verbose_logs = construct_verbose_logs(
+                 self,
+                 steps=[
+                     f"Task: {task} \n",
+                     f"Agent Plan: \n{prettify_list(agent_plan.plan)} \n",
+                     f"Final Score: {self.score} \n",
+                     f"Final Reason: {self.reason} \n",
+                 ],
+             )
+
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
+
+             return self.score
+
+     def _get_plan_adherence_score(self, task, plan, test_case):
+         prompt = PlanAdherenceTemplate.evaluate_adherence(
+             task, "\n".join(plan), test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=PlanAdherenceScore)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = self.model.generate(
+                     prompt, schema=PlanAdherenceScore
+                 )
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return PlanAdherenceScore(**data)
+
+     async def _a_get_plan_adherence_score(self, task, plan, test_case):
+         prompt = PlanAdherenceTemplate.evaluate_adherence(
+             task, "\n".join(plan), test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(
+                 prompt, schema=PlanAdherenceScore
+             )
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = await self.model.a_generate(
+                     prompt, schema=PlanAdherenceScore
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return PlanAdherenceScore(**data)
+
+     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
+         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=AgentPlan)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = self.model.generate(prompt, schema=AgentPlan)
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return AgentPlan(**data)
+
+     async def _a_extract_plan_from_trace(
+         self, test_case: LLMTestCase
+     ) -> AgentPlan:
+         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt, schema=AgentPlan)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: Task = await self.model.a_generate(
+                     prompt, schema=AgentPlan
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return AgentPlan(**data)
+
+     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+         prompt = StepEfficiencyTemplate.extract_task_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=Task)
+             self.evaluation_cost += cost
+             return res.task
+         else:
+             try:
+                 res: Task = self.model.generate(prompt, schema=Task)
+                 return res.task
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return data["task"]
+
+     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
+         prompt = StepEfficiencyTemplate.extract_task_from_trace(
+             test_case._trace_dict
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt, schema=Task)
+             self.evaluation_cost += cost
+             return res.task
+         else:
+             try:
+                 res: Task = await self.model.a_generate(prompt, schema=Task)
+                 return res.task
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return data["task"]
+
+     def is_successful(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Plan Adherence"
deepeval/metrics/plan_adherence/schema.py
@@ -0,0 +1,11 @@
+ from pydantic import BaseModel
+ from typing import List, Dict, Literal
+
+
+ class AgentPlan(BaseModel):
+     plan: List[str]
+
+
+ class PlanAdherenceScore(BaseModel):
+     score: float
+     reason: str
deepeval/metrics/plan_adherence/template.py
@@ -0,0 +1,170 @@
+ import textwrap
+ import json
+ from deepeval.tracing.utils import make_json_serializable
+
+
+ class PlanAdherenceTemplate:
+
+     @staticmethod
+     def extract_plan_from_trace(trace: dict) -> str:
+         return textwrap.dedent(
+             f"""You are a **systems analyst** evaluating an AI agent's execution trace.
+
+             Your sole task is to extract the **explicit or clearly implied plan** the agent followed or intended to follow — *only if that plan is directly evidenced in the trace*.
+
+             STRICT RULES TO FOLLOW:
+
+             1. Source Evidence Requirement
+             - Every plan step you include **must** be directly supported by explicit text in the trace.
+             - Acceptable evidence sources:
+             - `"reasoning"` or `"thought"` fields inside tool calls or function invocations.
+             - Explicit plan-like statements or lists written by the agent (e.g., “My plan is to…”).
+             - If no evidence exists for a step, DO NOT infer or invent it.
+
+             2. No Hallucination Policy
+             - You must *not* create or rephrase steps that aren't explicitly or strongly implied by the trace.
+             - If there is no coherent plan present, output an empty list.
+
+             3. Focus on Intent, Not Outcomes
+             - If the agent's plan is stated but execution differs, still extract the intended steps — but only if those intended steps are traceable.
+
+             4. Granularity
+             - Each step should represent a single distinct action or intention.
+             - Avoid merging multiple intentions into one step or splitting one intention into multiple steps.
+
+             5. Neutral Language
+             - Reproduce the plan steps in **neutral, minimal paraphrasing**.
+             - Do not interpret motivation, quality, or success of actions.
+
+             OUTPUT FORMAT:
+
+             Return a JSON object with exactly this structure:
+             {{
+             "plan": [
+             "step 1",
+             "step 2",
+             ...
+             ]
+             }}
+
+             If no plan is evidenced in the trace, return:
+             {{
+             "plan": []
+             }}
+
+             Do not include commentary, confidence scores, or explanations.
+
+             TRACE:
+
+             {json.dumps(trace, indent=2, default=str)}
+
+             JSON:
+             """
+         )
+
+     @staticmethod
+     def evaluate_adherence(
+         user_task: str, agent_plan: str, execution_trace: dict
+     ) -> str:
+         return textwrap.dedent(
+             f"""You are an **adversarial plan adherence evaluator**. Your goal is to assign the **lowest justifiable score** based on how strictly the agent's actions in the execution trace align with its declared plan.
+
+             INPUTS:
+
+             - **User Task:** The original request or objective.
+             - **Agent Plan:** The explicit step-by-step plan the agent was supposed to follow.
+             - **Execution Trace:** A detailed record of all agent actions, reasoning, tool calls, and outputs.
+
+             EVALUATION OBJECTIVE:
+
+             Determine whether the agent **exactly and exclusively** followed its plan.
+             You are not evaluating success, correctness, or usefulness — **only plan obedience**.
+
+             Assume **non-adherence by default** unless clear, direct evidence in the trace proves that
+             each planned step was executed *as written* and *no additional actions occurred*.
+
+             ### STRICT ADHERENCE RULES
+
+             1. Step Verification
+             - Every step in the plan must correspond to a **verifiable, explicit** action or reasoning entry in the trace.
+             - Each step must appear in the same logical order as the plan.
+             - If a step is missing, only implied, or ambiguous, treat as **not followed**.
+
+             2. No Extraneous Actions
+             - If the trace includes **any** major action, tool call, or reasoning segment not clearly present in the plan, immediately lower the score to as low as possible.
+             - Extra or unnecessary steps are considered serious violations.
+
+             3. Order Consistency
+             - If the agent performed steps in a different order than the plan specifies, the score must be close to 0, regardless of other alignment.
+
+             4. Completeness
+             - If even one planned step is missing, skipped, or only partially reflected in the trace, the score must be lowest possible.
+
+             5. Ambiguity Handling
+             - If it is unclear whether a trace action corresponds to a plan step, treat that step as **not executed**.
+             - When uncertain, assign the **lower score**.
+
+             6. Focus Exclusively on Plan Compliance
+             - Ignore task success, reasoning quality, or correctness of outcomes.
+             - Evaluate *only* whether the trace reflects the exact plan execution.
+
+
+             SCORING SCALE
+
+             - **1.0 — Perfect adherence**
+             - Every planned step is explicitly and verifiably present in the trace, in correct order.
+             - No skipped or added steps.
+             - No ambiguity in matching.
+
+             - **0.75 — Strong adherence**
+             - All or nearly all steps are executed in order.
+             - At most one minor deviation (e.g., a trivial reordering or minor redundant step) that does not change the plan’s structure.
+
+             - **0.5 — Partial adherence**
+             - Some steps clearly match, but others are missing, out of order, or replaced.
+             - At least one extra or ambiguous action appears.
+             - *This should be the highest score possible when there are any deviations.*
+
+             - **0.25 — Weak adherence**
+             - Only a few steps from the plan appear in the trace, or multiple extraneous actions occur.
+             - The structure or sequence of the plan is mostly lost.
+
+             - **0.0 — No adherence**
+             - The trace shows little or no resemblance to the plan.
+             - Steps are ignored, replaced, or executed in an entirely different order.
+
+             Always err toward the **lower score** when evidence is partial, ambiguous, or contradictory.
+
+             OUTPUT FORMAT:
+
+             Return a JSON object with exactly this structure:
+
+             {{
+             "score": 0.0,
+             "reason": "1-3 concise, factual sentences citing specific matched, missing, or extra steps."
+             }}
+
+             Requirements for `"reason"`:
+             - Reference specific plan step numbers or phrases.
+             - Mention concrete trace evidence of mismatches or additions.
+             - Avoid subjective adjectives (e.g., “mostly”, “close”, “reasonable”).
+             - Be precise and neutral.
+
+             ---
+
+             INPUTS:
+
+             User Task:
+             {user_task}
+
+             Agent Plan:
+             {agent_plan}
+
+             Execution Trace:
+             {json.dumps(execution_trace, indent=2, default=make_json_serializable)}
+
+             ---
+
+             JSON:
+             """
+         )
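For orientation, here is a small illustration (not part of the diff) of how these two prompt builders pair with the AgentPlan and PlanAdherenceScore schemas added in schema.py. The trace dict and the JSON replies are fabricated stand-ins for what a judge model would return.

# Illustrative only: fabricated trace and model replies, showing how the templates'
# expected JSON output validates against the new pydantic schemas.
import json
from deepeval.metrics.plan_adherence.schema import AgentPlan, PlanAdherenceScore
from deepeval.metrics.plan_adherence.template import PlanAdherenceTemplate

trace = {
    "spans": [
        {"name": "agent", "reasoning": "My plan is to 1) search flights 2) book the cheapest one"}
    ]
}

# Step 1: build the plan-extraction prompt; the judge model should answer with {"plan": [...]}.
extraction_prompt = PlanAdherenceTemplate.extract_plan_from_trace(trace)
plan = AgentPlan(**json.loads('{"plan": ["search flights", "book the cheapest one"]}'))

# Step 2: build the adherence prompt from that plan; the judge model should answer with a score and reason.
adherence_prompt = PlanAdherenceTemplate.evaluate_adherence(
    user_task="Book me the cheapest flight to Tokyo",
    agent_plan="\n".join(plan.plan),
    execution_trace=trace,
)
score = PlanAdherenceScore(
    **json.loads('{"score": 1.0, "reason": "Both planned steps appear in order with no extra actions."}')
)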
deepeval/metrics/plan_quality/__init__.py
@@ -0,0 +1 @@
+ from .plan_quality import PlanQualityMetric