deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0

deepeval/metrics/goal_accuracy/template.py

@@ -3,8 +3,16 @@ import textwrap


  class GoalAccuracyTemplate:
+ multimodal_rules = """
+ --- MULTIMODAL INPUT RULES ---
+ - Treat image content as factual evidence.
+ - Only reference visual details that are explicitly and clearly visible.
+ - Do not infer or guess objects, text, or details not visibly present.
+ - If an image is unclear or ambiguous, mark uncertainty explicitly.
+ """
+
  @staticmethod
- def get_accuracy_score(task, steps_taken):
+ def get_accuracy_score(task, steps_taken, multimodal: bool = False):
  return textwrap.dedent(
  f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.

@@ -36,6 +44,8 @@ class GoalAccuracyTemplate:
  - When uncertain, assume the goal was **not achieved**.
  - The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.

+ {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
  SCORING GUIDE:

  - **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
@@ -102,7 +112,7 @@ class GoalAccuracyTemplate:
  )

  @staticmethod
- def get_plan_evaluation_score(task, steps_taken):
+ def get_plan_evaluation_score(task, steps_taken, multimodal: bool = False):
  return textwrap.dedent(
  f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.

@@ -132,6 +142,8 @@ class GoalAccuracyTemplate:
  - Tool use should be coherent within the plan, not ad hoc or speculative.
  - This evaluation excludes correctness or efficiency — focus solely on plan and adherence.

+ {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
  SCORING GUIDE:

  - **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
@@ -188,7 +200,11 @@ class GoalAccuracyTemplate:

  @staticmethod
  def get_final_reason(
- final_score, threshold, goal_evaluations, plan_evalautions
+ final_score,
+ threshold,
+ goal_evaluations,
+ plan_evalautions,
+ multimodal: bool = False,
  ):
  return textwrap.dedent(
  f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
@@ -213,6 +229,8 @@ class GoalAccuracyTemplate:
  - If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
  - Avoid vague praise or criticism — ground the reason in the actual scores and justifications.

+ {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
  ---

  FORMAT:
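
The goal_accuracy template hunks above add a class-level multimodal_rules string and gate its injection on a new multimodal flag (the hallucination and JSON-correctness templates later in this diff interpolate the same block unconditionally). A minimal sketch of that conditional-injection pattern, using an illustrative ExampleTemplate rather than deepeval's actual class:

import textwrap

class ExampleTemplate:  # hypothetical stand-in for GoalAccuracyTemplate
    multimodal_rules = """
    --- MULTIMODAL INPUT RULES ---
    - Treat image content as factual evidence.
    - Only reference visual details that are explicitly and clearly visible.
    """

    @staticmethod
    def get_accuracy_score(task, steps_taken, multimodal: bool = False):
        # The rules block is injected only for multimodal test cases;
        # text-only callers get the unchanged prompt.
        return textwrap.dedent(
            f"""Evaluate whether the goal was achieved.

            Task: {task}
            Steps taken: {steps_taken}

            {ExampleTemplate.multimodal_rules if multimodal else ""}
            """
        )

print(ExampleTemplate.get_accuracy_score("book a flight", ["searched flights"], multimodal=True))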

deepeval/metrics/hallucination/hallucination.py

@@ -8,14 +8,19 @@ from deepeval.metrics import BaseMetric
  from deepeval.utils import get_or_create_event_loop, prettify_list
  from deepeval.metrics.utils import (
  construct_verbose_logs,
- trimAndLoadJson,
  check_llm_test_case_params,
  initialize_model,
+ a_generate_with_schema_and_extract,
+ generate_with_schema_and_extract,
  )
  from deepeval.metrics.hallucination.template import HallucinationTemplate
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
- from deepeval.metrics.hallucination.schema import *
+ from deepeval.metrics.hallucination.schema import (
+ HallucinationVerdict,
+ Verdicts,
+ HallucinationScoreReason,
+ )
  from deepeval.metrics.api import metric_data_manager


@@ -55,7 +60,16 @@ class HallucinationMetric(BaseMetric):
  _log_metric_to_confident: bool = True,
  ) -> float:

- check_llm_test_case_params(test_case, self._required_params, self)
+ multimodal = test_case.multimodal
+ check_llm_test_case_params(
+ test_case,
+ self._required_params,
+ None,
+ None,
+ self,
+ self.model,
+ multimodal,
+ )

  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
@@ -102,7 +116,16 @@ class HallucinationMetric(BaseMetric):
  _log_metric_to_confident: bool = True,
  ) -> float:

- check_llm_test_case_params(test_case, self._required_params, self)
+ multimodal = test_case.multimodal
+ check_llm_test_case_params(
+ test_case,
+ self._required_params,
+ None,
+ None,
+ self,
+ self.model,
+ multimodal,
+ )

  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
@@ -150,22 +173,13 @@ class HallucinationMetric(BaseMetric):
  score=format(self.score, ".2f"),
  )

- if self.using_native_model:
- res, cost = await self.model.a_generate(
- prompt, schema=HallucinationScoreReason
- )
- self.evaluation_cost += cost
- return res.reason
- else:
- try:
- res: HallucinationScoreReason = await self.model.a_generate(
- prompt, schema=HallucinationScoreReason
- )
- return res.reason
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["reason"]
+ return await a_generate_with_schema_and_extract(
+ metric=self,
+ prompt=prompt,
+ schema_cls=HallucinationScoreReason,
+ extract_schema=lambda s: s.reason,
+ extract_json=lambda data: data["reason"],
+ )

  def _generate_reason(self):
  if self.include_reason is False:
@@ -185,74 +199,45 @@ class HallucinationMetric(BaseMetric):
  score=format(self.score, ".2f"),
  )

- if self.using_native_model:
- res, cost = self.model.generate(
- prompt, schema=HallucinationScoreReason
- )
- self.evaluation_cost += cost
- return res.reason
- else:
- try:
- res: HallucinationScoreReason = self.model.generate(
- prompt, schema=HallucinationScoreReason
- )
- return res.reason
- except TypeError:
- res = self.model.generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["reason"]
+ return generate_with_schema_and_extract(
+ metric=self,
+ prompt=prompt,
+ schema_cls=HallucinationScoreReason,
+ extract_schema=lambda s: s.reason,
+ extract_json=lambda data: data["reason"],
+ )

  async def _a_generate_verdicts(
  self, actual_output: str, contexts: List[str]
  ) -> List[HallucinationVerdict]:
- verdicts: List[HallucinationVerdict] = []
  prompt = self.evaluation_template.generate_verdicts(
  actual_output=actual_output, contexts=contexts
  )
- if self.using_native_model:
- res, cost = await self.model.a_generate(prompt, schema=Verdicts)
- self.evaluation_cost += cost
- verdicts = [item for item in res.verdicts]
- return verdicts
- else:
- try:
- res: Verdicts = await self.model.a_generate(
- prompt, schema=Verdicts
- )
- verdicts = [item for item in res.verdicts]
- return verdicts
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = trimAndLoadJson(res, self)
- verdicts = [
- HallucinationVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
+ return await a_generate_with_schema_and_extract(
+ metric=self,
+ prompt=prompt,
+ schema_cls=Verdicts,
+ extract_schema=lambda s: list(s.verdicts),
+ extract_json=lambda data: [
+ HallucinationVerdict(**item) for item in data["verdicts"]
+ ],
+ )

  def _generate_verdicts(
  self, actual_output: str, contexts: List[str]
  ) -> List[HallucinationVerdict]:
- verdicts: List[HallucinationVerdict] = []
  prompt = self.evaluation_template.generate_verdicts(
  actual_output=actual_output, contexts=contexts
  )
- if self.using_native_model:
- res, cost = self.model.generate(prompt, schema=Verdicts)
- self.evaluation_cost += cost
- verdicts = [item for item in res.verdicts]
- return verdicts
- else:
- try:
- res: Verdicts = self.model.generate(prompt, schema=Verdicts)
- verdicts = [item for item in res.verdicts]
- return verdicts
- except TypeError:
- res = self.model.generate(prompt)
- data = trimAndLoadJson(res, self)
- verdicts = [
- HallucinationVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
+ return generate_with_schema_and_extract(
+ metric=self,
+ prompt=prompt,
+ schema_cls=Verdicts,
+ extract_schema=lambda s: list(s.verdicts),
+ extract_json=lambda data: [
+ HallucinationVerdict(**item) for item in data["verdicts"]
+ ],
+ )

  def _calculate_score(self) -> float:
  number_of_verdicts = len(self.verdicts)
@@ -273,7 +258,7 @@ class HallucinationMetric(BaseMetric):
  else:
  try:
  self.success = self.score <= self.threshold
- except:
+ except TypeError:
  self.success = False
  return self.success
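
All of the deleted branches above collapse into the new a_generate_with_schema_and_extract / generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. A rough sketch of what the synchronous helper presumably centralizes, reconstructed from the inline code it replaces (the real implementation in 3.7.7 may differ):

from deepeval.metrics.utils import trimAndLoadJson

def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    # Sketch only: mirrors the branches deleted above, not deepeval's actual helper.
    if metric.using_native_model:
        # Native models return (parsed schema, cost); accumulate evaluation cost.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that support schemas return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: fall back to parsing raw JSON output.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)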

deepeval/metrics/hallucination/template.py

@@ -2,9 +2,20 @@ from typing import List


  class HallucinationTemplate:
+ multimodal_rules = """
+ --- MULTIMODAL INPUT RULES ---
+ - Treat image content as factual evidence.
+ - Only reference visual details that are explicitly and clearly visible.
+ - Do not infer or guess objects, text, or details not visibly present.
+ - If an image is unclear or ambiguous, mark uncertainty explicitly.
+ """
+
  @staticmethod
  def generate_verdicts(actual_output: str, contexts: List[str]):
  return f"""For each context in contexts, which is a list of strings, please generate a list of JSON objects to indicate whether the given 'actual output' agrees with EACH context. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+ {HallucinationTemplate.multimodal_rules}
+
  The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the given text agrees with the context.
  The 'reason' is the reason for the verdict. When the answer is 'no', try to provide a correction in the reason.

@@ -46,6 +57,8 @@ JSON:
  ):
  return f"""Given a list of factual alignments and contradictions, which highlights alignment/contradictions between the `actual output` and `contexts, use it to provide a reason for the hallucination score in a CONCISELY. Note that The hallucination score ranges from 0 - 1, and the lower the better.

+ {HallucinationTemplate.multimodal_rules}
+
  **
  IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
  Example JSON:

deepeval/metrics/indicator.py

@@ -1,24 +1,24 @@
+ import asyncio
+ import logging
+ import sys
+ import time
  from rich.console import Console
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
  from contextlib import contextmanager
- import sys
  from typing import List, Optional, Union
- import time
- import asyncio

  from deepeval.errors import MissingTestCaseParamsError
  from deepeval.metrics import (
  BaseMetric,
  BaseConversationalMetric,
- BaseMultimodalMetric,
  BaseArenaMetric,
  )
  from deepeval.test_case import LLMTestCase, ConversationalTestCase
  from deepeval.test_run.cache import CachedTestCase, Cache
  from deepeval.telemetry import capture_metric_type
  from deepeval.utils import update_pbar
+ from deepeval.config.settings import get_settings

- import logging

  logger = logging.getLogger(__name__)

@@ -74,7 +74,7 @@ def metric_progress_indicator(
  async def measure_metric_task(
  task_id,
  progress,
- metric: Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric],
+ metric: Union[BaseMetric, BaseConversationalMetric],
  test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
  cached_test_case: Union[CachedTestCase, None],
  ignore_errors: bool,
@@ -156,9 +156,7 @@ async def measure_metric_task(


  async def measure_metrics_with_indicator(
- metrics: List[
- Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
- ],
+ metrics: List[Union[BaseMetric, BaseConversationalMetric]],
  test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
  cached_test_case: Union[CachedTestCase, None],
  ignore_errors: bool,
@@ -238,7 +236,7 @@ async def measure_metrics_with_indicator(


  async def safe_a_measure(
- metric: Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric],
+ metric: Union[BaseMetric, BaseConversationalMetric],
  tc: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
  ignore_errors: bool,
  skip_on_missing_params: bool,
@@ -263,6 +261,9 @@ async def safe_a_measure(
  "Timed out/cancelled while evaluating metric. "
  "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
  "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+ if not get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+ else "Cancelled while evaluating metric (DeepEval timeouts are disabled; this likely came from upstream orchestration or the provider/network layer). "
+ "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
  )
  metric.success = False
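
The new branch only swaps the cancellation message when DeepEval's own timeouts are disabled in settings. An assumed example of the environment toggles the message refers to (the variable names come from this diff; the values shown are assumptions):

import os

# Assumed usage; set these before deepeval loads its settings.
os.environ["DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE"] = "300"  # raise the per-task timeout
os.environ["DEEPEVAL_LOG_STACK_TRACES"] = "1"  # log the full traceback on metric errors
# os.environ["DEEPEVAL_DISABLE_TIMEOUTS"] = "1"  # or disable DeepEval's timeouts entirely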

deepeval/metrics/json_correctness/json_correctness.py

@@ -11,7 +11,8 @@ from deepeval.metrics.utils import (
  construct_verbose_logs,
  check_llm_test_case_params,
  initialize_model,
- trimAndLoadJson,
+ a_generate_with_schema_and_extract,
+ generate_with_schema_and_extract,
  )
  from deepeval.models import DeepEvalBaseLLM
  from deepeval.metrics.indicator import metric_progress_indicator
@@ -46,6 +47,7 @@ class JsonCorrectnessMetric(BaseMetric):
  self.async_mode = async_mode
  self.verbose_mode = verbose_mode
  self.expected_schema = expected_schema
+ self.evaluation_model = self.model.get_model_name()

  def measure(
  self,
@@ -55,7 +57,16 @@ class JsonCorrectnessMetric(BaseMetric):
  _log_metric_to_confident: bool = True,
  ) -> float:

- check_llm_test_case_params(test_case, self._required_params, self)
+ multimodal = test_case.multimodal
+ check_llm_test_case_params(
+ test_case,
+ self._required_params,
+ None,
+ None,
+ self,
+ self.model,
+ multimodal,
+ )

  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
@@ -77,7 +88,7 @@ class JsonCorrectnessMetric(BaseMetric):
  self.expected_schema.model_validate_json(
  test_case.actual_output
  )
- except ValidationError as e:
+ except ValidationError:
  valid_json = False

  self.score = 1 if valid_json else 0
@@ -106,7 +117,16 @@ class JsonCorrectnessMetric(BaseMetric):
  _log_metric_to_confident: bool = True,
  ) -> float:

- check_llm_test_case_params(test_case, self._required_params, self)
+ multimodal = test_case.multimodal
+ check_llm_test_case_params(
+ test_case,
+ self._required_params,
+ None,
+ None,
+ self,
+ self.model,
+ multimodal,
+ )

  self.evaluation_cost = 0 if self.using_native_model else None
  with metric_progress_indicator(
@@ -120,7 +140,7 @@ class JsonCorrectnessMetric(BaseMetric):
  self.expected_schema.model_validate_json(
  test_case.actual_output
  )
- except ValidationError as e:
+ except ValidationError:
  valid_json = False

  self.score = 1 if valid_json else 0
@@ -156,22 +176,13 @@ class JsonCorrectnessMetric(BaseMetric):
  is_valid_json=is_valid_json,
  )

- if self.using_native_model:
- res, cost = await self.model.a_generate(
- prompt, schema=JsonCorrectnessScoreReason
- )
- self.evaluation_cost += cost
- return res.reason
- else:
- try:
- res: JsonCorrectnessScoreReason = await self.model.a_generate(
- prompt, schema=JsonCorrectnessScoreReason
- )
- return res.reason
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["reason"]
+ return await a_generate_with_schema_and_extract(
+ metric=self,
+ prompt=prompt,
+ schema_cls=JsonCorrectnessScoreReason,
+ extract_schema=lambda s: s.reason,
+ extract_json=lambda data: data["reason"],
+ )

  def generate_reason(self, actual_output: str) -> str:
  if self.include_reason is False:
@@ -189,22 +200,13 @@ class JsonCorrectnessMetric(BaseMetric):
  is_valid_json=is_valid_json,
  )

- if self.using_native_model:
- res, cost = self.model.generate(
- prompt, schema=JsonCorrectnessScoreReason
- )
- self.evaluation_cost += cost
- return res.reason
- else:
- try:
- res: JsonCorrectnessScoreReason = self.model.generate(
- prompt, schema=JsonCorrectnessScoreReason
- )
- return res.reason
- except TypeError:
- res = self.model.generate(prompt)
- data = trimAndLoadJson(res, self)
- return data["reason"]
+ return generate_with_schema_and_extract(
+ metric=self,
+ prompt=prompt,
+ schema_cls=JsonCorrectnessScoreReason,
+ extract_schema=lambda s: s.reason,
+ extract_json=lambda data: data["reason"],
+ )

  def is_successful(self) -> bool:
  if self.error is not None:
@@ -212,7 +214,7 @@ class JsonCorrectnessMetric(BaseMetric):
  else:
  try:
  self.success = self.score >= self.threshold
- except:
+ except TypeError:
  self.success = False
  return self.success
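
JsonCorrectnessMetric keeps its binary scoring: the output scores 1 only when it parses against the expected pydantic schema, as the model_validate_json / ValidationError hunks above show. A self-contained sketch of that validation path, with a hypothetical ExpectedAnswer schema and sample output:

from pydantic import BaseModel, ValidationError

class ExpectedAnswer(BaseModel):  # hypothetical expected_schema for illustration
    name: str
    age: int

actual_output = '{"name": "Ada", "age": 36}'  # pretend LLM output

try:
    ExpectedAnswer.model_validate_json(actual_output)
    valid_json = True
except ValidationError:
    valid_json = False

score = 1 if valid_json else 0  # binary score, mirroring the metric
print(score)  # 1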

deepeval/metrics/json_correctness/template.py

@@ -2,12 +2,22 @@ from typing import Optional


  class JsonCorrectnessTemplate:
+ multimodal_rules = """
+ --- MULTIMODAL INPUT RULES ---
+ - Treat image content as factual evidence.
+ - Only reference visual details that are explicitly and clearly visible.
+ - Do not infer or guess objects, text, or details not visibly present.
+ - If an image is unclear or ambiguous, mark uncertainty explicitly.
+ """
+
  @staticmethod
  def generate_reason(
  actual_output: str, expected_schema: str, is_valid_json: bool
  ):
  return f"""Based on the given generated json, generated by an LLM, and a boolean stating whether it is a valid JSON based on the expected json schema, give a reason why it is OR is not a valid Json.

+ {JsonCorrectnessTemplate.multimodal_rules}
+
  **
  IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
  Example JSON: