deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/plan_adherence/plan_adherence.py

@@ -3,9 +3,10 @@ from typing import Optional, List, Union, Dict
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.metrics import BaseMetric
@@ -30,7 +31,6 @@ class PlanAdherenceMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_CALLED,
     ]

     def __init__(
@@ -58,9 +58,15 @@ class PlanAdherenceMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -118,9 +124,15 @@ class PlanAdherenceMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None

@@ -169,58 +181,37 @@ class PlanAdherenceMetric(BaseMetric):
         prompt = PlanAdherenceTemplate.evaluate_adherence(
             task, "\n".join(plan), test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=PlanAdherenceScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: Task = self.model.generate(
-                    prompt, schema=PlanAdherenceScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanAdherenceScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanAdherenceScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanAdherenceScore(**data),
+        )

     async def _a_get_plan_adherence_score(self, task, plan, test_case):
         prompt = PlanAdherenceTemplate.evaluate_adherence(
             task, "\n".join(plan), test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=PlanAdherenceScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: Task = await self.model.a_generate(
-                    prompt, schema=PlanAdherenceScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanAdherenceScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanAdherenceScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanAdherenceScore(**data),
+        )

     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=AgentPlan)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: Task = self.model.generate(prompt, schema=AgentPlan)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return AgentPlan(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AgentPlan,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: AgentPlan(**data),
+        )

     async def _a_extract_plan_from_trace(
         self, test_case: LLMTestCase
@@ -228,54 +219,37 @@ class PlanAdherenceMetric(BaseMetric):
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=AgentPlan)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: Task = await self.model.a_generate(
-                    prompt, schema=AgentPlan
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return AgentPlan(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AgentPlan,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: AgentPlan(**data),
+        )

     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Task)
-            self.evaluation_cost += cost
-            return res.task
-        else:
-            try:
-                res: Task = self.model.generate(prompt, schema=Task)
-                return res.task
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Task,
+            extract_schema=lambda s: s.task,
+            extract_json=lambda data: data["task"],
+        )

     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Task)
-            self.evaluation_cost += cost
-            return res.task
-        else:
-            try:
-                res: Task = await self.model.a_generate(prompt, schema=Task)
-                return res.task
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Task,
+            extract_schema=lambda s: s.task,
+            extract_json=lambda data: data["task"],
+        )

     def is_successful(self) -> bool:
         if self.error is not None:
@@ -283,7 +257,7 @@ class PlanAdherenceMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

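The recurring edit across these metric files replaces the per-metric native-model/custom-model branching with two shared helpers imported from deepeval.metrics.utils (that module changes by +145 -90 in this release). Their implementation is not part of this diff; the sketch below is only an approximation reconstructed from the call sites above and from the branching they replace. The helper name and keyword arguments come from the diff; everything else is an assumption.

# Approximate sketch only -- the real helper lives in deepeval/metrics/utils.py
# and may differ; signature inferred from the call sites in this diff.
from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    if metric.using_native_model:
        # Native models return (parsed schema, cost); the cost is accumulated
        # on the metric, mirroring the inline code removed above.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema keyword return the parsed object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without schema support: fall back to raw text plus
        # JSON trimming/parsing, then build the result from the dict.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

a_generate_with_schema_and_extract is presumably the awaitable twin that calls metric.model.a_generate instead, which is why every async call site above simply awaits it.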
deepeval/metrics/plan_adherence/template.py

@@ -4,6 +4,13 @@ from deepeval.tracing.utils import make_json_serializable


 class PlanAdherenceTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """

     @staticmethod
     def extract_plan_from_trace(trace: dict) -> str:
@@ -36,6 +43,8 @@ class PlanAdherenceTemplate:
 - Reproduce the plan steps in **neutral, minimal paraphrasing**.
 - Do not interpret motivation, quality, or success of actions.

+{PlanAdherenceTemplate.multimodal_rules}
+
 OUTPUT FORMAT:

 Return a JSON object with exactly this structure:
@@ -108,6 +117,8 @@ class PlanAdherenceTemplate:
 - Ignore task success, reasoning quality, or correctness of outcomes.
 - Evaluate *only* whether the trace reflects the exact plan execution.

+{PlanAdherenceTemplate.multimodal_rules}
+

 SCORING SCALE

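Both template files gain the same multimodal_rules block, which the prompt-building static methods splice in through f-string interpolation (the {PlanAdherenceTemplate.multimodal_rules} lines added above). A minimal, self-contained illustration of the pattern; the surrounding prompt wording here is invented for brevity and is not the actual template text:

class ExampleTemplate:
    multimodal_rules = """
    --- MULTIMODAL INPUT RULES ---
    - Treat image content as factual evidence.
    - Only reference visual details that are explicitly and clearly visible.
    """

    @staticmethod
    def extract_plan_from_trace(trace: dict) -> str:
        # The class attribute is expanded inside the returned prompt, exactly
        # like the {PlanAdherenceTemplate.multimodal_rules} references above.
        return f"""Extract the plan the agent committed to from the trace below.

{ExampleTemplate.multimodal_rules}

TRACE:
{trace}
"""


print(ExampleTemplate.extract_plan_from_trace({"spans": []}))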
deepeval/metrics/plan_quality/plan_quality.py

@@ -3,9 +3,10 @@ from typing import Optional, List, Union, Dict
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.metrics import BaseMetric
@@ -33,7 +34,6 @@ class PlanQualityMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_CALLED,
     ]

     def __init__(
@@ -61,9 +61,15 @@ class PlanQualityMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -121,9 +127,15 @@ class PlanQualityMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None

@@ -171,56 +183,37 @@ class PlanQualityMetric(BaseMetric):
         prompt = PlanQualityTemplate.evaluate_plan_quality(
             task, "\n".join(plan)
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=PlanQualityScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: Task = self.model.generate(prompt, schema=PlanQualityScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanQualityScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanQualityScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanQualityScore(**data),
+        )

     async def _a_get_plan_quality_score(self, task, plan):
         prompt = PlanQualityTemplate.evaluate_plan_quality(
             task, "\n".join(plan)
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=PlanQualityScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: Task = await self.model.a_generate(
-                    prompt, schema=PlanQualityScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanQualityScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanQualityScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanQualityScore(**data),
+        )

     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=AgentPlan)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: Task = self.model.generate(prompt, schema=AgentPlan)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return AgentPlan(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AgentPlan,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: AgentPlan(**data),
+        )

     async def _a_extract_plan_from_trace(
         self, test_case: LLMTestCase
@@ -228,54 +221,37 @@ class PlanQualityMetric(BaseMetric):
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=AgentPlan)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: Task = await self.model.a_generate(
-                    prompt, schema=AgentPlan
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return AgentPlan(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=AgentPlan,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: AgentPlan(**data),
+        )

     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Task)
-            self.evaluation_cost += cost
-            return res.task
-        else:
-            try:
-                res: Task = self.model.generate(prompt, schema=Task)
-                return res.task
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Task,
+            extract_schema=lambda s: s.task,
+            extract_json=lambda data: data["task"],
+        )

     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Task)
-            self.evaluation_cost += cost
-            return res.task
-        else:
-            try:
-                res: Task = await self.model.a_generate(prompt, schema=Task)
-                return res.task
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Task,
+            extract_schema=lambda s: s.task,
+            extract_json=lambda data: data["task"],
+        )

     def is_successful(self) -> bool:
         if self.error is not None:
@@ -283,7 +259,7 @@ class PlanQualityMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

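The TypeError fallback that the new helpers (and the removed inline code) rely on exists because custom DeepEvalBaseLLM subclasses are not required to accept a schema keyword. A rough illustration of such a model; the wrapped client and its complete() call are hypothetical:

from deepeval.models.base_model import DeepEvalBaseLLM


class PlainTextModel(DeepEvalBaseLLM):
    """Hypothetical custom model whose generate() takes no schema parameter."""

    def __init__(self, client):
        self.client = client

    def load_model(self):
        return self.client

    def generate(self, prompt: str) -> str:
        # Calling generate(prompt, schema=...) on this model raises TypeError,
        # which is what routes execution to the trimAndLoadJson fallback path.
        return self.client.complete(prompt)

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    def get_model_name(self):
        return "plain-text-model"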
deepeval/metrics/plan_quality/template.py

@@ -4,6 +4,13 @@ from deepeval.tracing.utils import make_json_serializable


 class PlanQualityTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """

     @staticmethod
     def evaluate_plan_quality(user_task: str, agent_plan: list) -> str:
@@ -46,6 +53,8 @@ class PlanQualityTemplate:
 - The plan must explicitly and directly target the user's stated goal.
 - If any step diverges from the main objective, the score should drop significantly.

+{PlanQualityTemplate.multimodal_rules}
+
 ---

 SCORING SCALE (STRICT)
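Taken together with the validation change in plan_quality.py, TOOLS_CALLED is no longer a required parameter and the has_trace bypass is gone, so input and actual_output are now always validated. A usage sketch under the 3.7.6 surface; the import path matches the file list above, while the no-argument constructor and the inspected attributes are assumed to follow the same conventions as deepeval's other metrics:

from deepeval.test_case import LLMTestCase
from deepeval.metrics.plan_quality.plan_quality import PlanQualityMetric  # path from the file list above

test_case = LLMTestCase(
    input="Book a flight to Tokyo and a hotel near Shibuya.",
    actual_output="Booked NH 106 for 14 June and the Shibuya Excel Hotel.",
    # tools_called is no longer in _required_params for this metric in 3.7.6
)

metric = PlanQualityMetric()  # constructor defaults assumed, as with other BaseMetric metrics
metric.measure(test_case)
print(metric.score, metric.is_successful())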