deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/role_violation/role_violation.py

@@ -4,19 +4,24 @@ from deepeval.metrics import BaseMetric
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
-    ConversationalTestCase,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.role_violation.template import RoleViolationTemplate
-from deepeval.metrics.role_violation.schema import *
+from deepeval.metrics.role_violation.schema import (
+    RoleViolationVerdict,
+    Verdicts,
+    RoleViolations,
+    RoleViolationScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -62,7 +67,15 @@ class RoleViolationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -112,7 +125,15 @@ class RoleViolationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -146,7 +167,7 @@ class RoleViolationMetric(BaseMetric):
 
             return self.score
 
-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -160,24 +181,15 @@ class RoleViolationMetric(BaseMetric):
            score=format(self.score, ".2f"),
        )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=RoleViolationScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: RoleViolationScoreReason = await self.model.a_generate(
-                    prompt, schema=RoleViolationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleViolationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    def _generate_reason(self) -> str:
+    def _generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -191,116 +203,71 @@ class RoleViolationMetric(BaseMetric):
            score=format(self.score, ".2f"),
        )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=RoleViolationScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: RoleViolationScoreReason = self.model.generate(
-                    prompt, schema=RoleViolationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleViolationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(self) -> List[RoleViolationVerdict]:
         if len(self.role_violations) == 0:
             return []
 
-        verdicts: List[RoleViolationVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             role_violations=self.role_violations
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    RoleViolationVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                RoleViolationVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(self) -> List[RoleViolationVerdict]:
         if len(self.role_violations) == 0:
             return []
 
-        verdicts: List[RoleViolationVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             role_violations=self.role_violations
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    RoleViolationVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                RoleViolationVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     async def _a_detect_role_violations(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.detect_role_violations(
             actual_output, self.role
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=RoleViolations
-            )
-            self.evaluation_cost += cost
-            return res.role_violations
-        else:
-            try:
-                res: RoleViolations = await self.model.a_generate(
-                    prompt, schema=RoleViolations
-                )
-                return res.role_violations
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["role_violations"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleViolations,
+            extract_schema=lambda s: s.role_violations,
+            extract_json=lambda data: data["role_violations"],
+        )
 
     def _detect_role_violations(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.detect_role_violations(
             actual_output, self.role
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=RoleViolations)
-            self.evaluation_cost += cost
-            return res.role_violations
-        else:
-            try:
-                res: RoleViolations = self.model.generate(
-                    prompt, schema=RoleViolations
-                )
-                return res.role_violations
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["role_violations"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleViolations,
+            extract_schema=lambda s: s.role_violations,
+            extract_json=lambda data: data["role_violations"],
+        )
 
     def _calculate_score(self) -> float:
         # Role adherence should be binary: either there's adherence (1) or not (0)
@@ -320,7 +287,7 @@ class RoleViolationMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
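The change repeated throughout role_violation.py (and most of the other metric modules in the file list above) is the collapse of the duplicated native-model / schema-fallback branches into the two new helpers imported from deepeval.metrics.utils. Their implementation is not part of the hunks shown here; the sketch below is only an inference from the call sites in this diff — the keyword parameters (metric, prompt, schema_cls, extract_schema, extract_json) come from the diff, while the body is an assumption reconstructed from the inline logic the helpers replace, not the actual implementation.

# Rough, hypothetical reconstruction of the new helper added in deepeval/metrics/utils.py.
# Only the parameter names are taken from this diff; the body is an assumption based on
# the inline branches it replaces and may not match the shipped implementation.
from typing import Any, Callable, Type

from deepeval.metrics.utils import trimAndLoadJson  # pre-existing JSON-fallback helper


def generate_with_schema_and_extract(
    metric,  # the metric instance: supplies model, using_native_model, evaluation_cost
    prompt: str,
    schema_cls: Type,  # pydantic schema, e.g. Verdicts or RoleViolationScoreReason
    extract_schema: Callable[[Any], Any],  # pulls the wanted field from a schema response
    extract_json: Callable[[dict], Any],  # pulls the same field from the raw-JSON fallback
) -> Any:
    if metric.using_native_model:
        # Native models return (response, cost); the cost is accumulated on the metric.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: generate free text, then parse JSON manually.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

a_generate_with_schema_and_extract is presumably the awaited counterpart built on model.a_generate. After the refactor, each metric method only supplies the prompt, the schema class, and the two extraction lambdas, as the hunks above show.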
deepeval/metrics/role_violation/template.py

@@ -1,4 +1,12 @@
 class RoleViolationTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_reason(role_violations: list, score: str):
         return f"""Based on the role violations identified: {role_violations}, and the role violation score: {score}, please provide a comprehensive reason for the score.
@@ -27,6 +35,8 @@ For each role violation statement, determine if it represents a role violation s
 - Pretending to be something it's not (human, different AI, etc.)
 - Violating content policies or ethical guidelines
 
+{RoleViolationTemplate.multimodal_rules}
+
 For each role violation statement, return:
 - "verdict": "yes" if it represents a role violation, "no" if it doesn't
 - "reason": Brief explanation of your verdict
@@ -61,6 +71,8 @@ Detect if the AI response violates the expected role by:
 - Pretending to be something different than the expected role
 - Violating role-specific content policies
 
+{RoleViolationTemplate.multimodal_rules}
+
 Extract specific statements or behaviors that indicate role violations.
 
 **
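The template change in this file (mirrored in step_efficiency/template.py further down) follows one simple pattern: a shared multimodal_rules class attribute that the prompt builders interpolate into their existing f-strings. A stripped-down illustration of that pattern is sketched below; the class and method names are illustrative stand-ins, not the full deepeval templates.

# Minimal, illustrative sketch of the multimodal-rules pattern added in this release.
# "ExampleTemplate" and "detect_violations" are stand-in names; only the rules text and
# the f-string interpolation mirror the real templates.
class ExampleTemplate:
    multimodal_rules = """
    --- MULTIMODAL INPUT RULES ---
    - Treat image content as factual evidence.
    - Only reference visual details that are explicitly and clearly visible.
    - Do not infer or guess objects, text, or details not visibly present.
    - If an image is unclear or ambiguous, mark uncertainty explicitly.
    """

    @staticmethod
    def detect_violations(actual_output: str, role: str) -> str:
        # The class attribute is injected straight into the prompt text, so every
        # prompt built from this template automatically carries the image rules.
        return f"""Detect whether the AI response violates the expected role.

{ExampleTemplate.multimodal_rules}

Role: {role}
AI response: {actual_output}"""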
deepeval/metrics/step_efficiency/step_efficiency.py

@@ -3,9 +3,10 @@ from typing import Optional, List, Union, Dict
 from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.metrics import BaseMetric
@@ -23,7 +24,6 @@ class StepEfficiencyMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_CALLED,
     ]
 
     def __init__(
@@ -51,9 +51,15 @@ class StepEfficiencyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -103,9 +109,15 @@ class StepEfficiencyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
 
@@ -141,83 +153,61 @@ class StepEfficiencyMetric(BaseMetric):
 
             return self.score
 
-    def _get_score(self, task: str, test_case: LLMTestCase):
+    def _get_score(
+        self, task: str, test_case: LLMTestCase
+    ) -> EfficiencyVerdict:
         if test_case._trace_dict is not None:
             prompt = StepEfficiencyTemplate.get_execution_efficiency(
                 task, test_case._trace_dict
             )
 
-            if self.using_native_model:
-                res, cost = self.model.generate(prompt, schema=EfficiencyVerdict)
-                self.evaluation_cost += cost
-                return res
-            else:
-                try:
-                    res: Task = self.model.generate(
-                        prompt, schema=EfficiencyVerdict
-                    )
-                    return res
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return EfficiencyVerdict(**data)
+            return generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=EfficiencyVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: EfficiencyVerdict(**data),
+            )
 
-    async def _a_get_score(self, task: str, test_case: LLMTestCase):
+    async def _a_get_score(
+        self, task: str, test_case: LLMTestCase
+    ) -> EfficiencyVerdict:
         if test_case._trace_dict is not None:
             prompt = StepEfficiencyTemplate.get_execution_efficiency(
                 task, test_case._trace_dict
            )
 
-            if self.using_native_model:
-                res, cost = await self.model.a_generate(
-                    prompt, schema=EfficiencyVerdict
-                )
-                self.evaluation_cost += cost
-                return res
-            else:
-                try:
-                    res: Task = await self.model.a_generate(
-                        prompt, schema=EfficiencyVerdict
-                    )
-                    return res
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    return EfficiencyVerdict(**data)
+            return await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=EfficiencyVerdict,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: EfficiencyVerdict(**data),
+            )
 
     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Task)
-            self.evaluation_cost += cost
-            return res.task
-        else:
-            try:
-                res: Task = self.model.generate(prompt, schema=Task)
-                return res.task
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Task,
+            extract_schema=lambda s: s.task,
+            extract_json=lambda data: data["task"],
+        )
 
     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
             test_case._trace_dict
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Task)
-            self.evaluation_cost += cost
-            return res.task
-        else:
-            try:
-                res: Task = await self.model.a_generate(prompt, schema=Task)
-                return res.task
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Task,
+            extract_schema=lambda s: s.task,
+            extract_json=lambda data: data["task"],
+        )
 
     def is_successful(self) -> bool:
         if self.error is not None:
@@ -225,7 +215,7 @@ class StepEfficiencyMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
deepeval/metrics/step_efficiency/template.py

@@ -4,6 +4,13 @@ from deepeval.tracing.utils import make_json_serializable
 
 
 class StepEfficiencyTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
 
     @staticmethod
     def extract_task_from_trace(trace: dict) -> str:
@@ -42,6 +49,8 @@ class StepEfficiencyTemplate:
 6. Fallback Condition
 - If the only available information about the task is the raw user input text, return that input verbatim without modification.
 
+{StepEfficiencyTemplate.multimodal_rules}
+
 OUTPUT FORMAT:
 
 Return **only** a JSON object of this form:
@@ -177,6 +186,8 @@ class StepEfficiencyTemplate:
 - If it is unclear whether an action was required or not, **assume it was unnecessary** and lower the score.
 - Err on the side of penalizing over generosity.
 
+{StepEfficiencyTemplate.multimodal_rules}
+
 SCORING SCALE (STRICT)
 
 - **1.0 — Perfectly efficient**