deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0

deepeval/metrics/g_eval/template.py

@@ -3,11 +3,23 @@ import textwrap
 
 
 class GEvalTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def generate_evaluation_steps(parameters: str, criteria: str):
+    def generate_evaluation_steps(
+        parameters: str, criteria: str, multimodal: bool = False
+    ):
         return textwrap.dedent(
             f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another.
 
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Criteria:
             {criteria}
 
@@ -31,6 +43,7 @@ class GEvalTemplate:
         rubric: Optional[str] = None,
         score_range: Tuple[int, int] = (0, 10),
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         rubric_text = f"Rubric:\n{rubric}\n" if rubric else ""
         dependencies = (
@@ -62,6 +75,7 @@ class GEvalTemplate:
             - {reasoning_expectation}
             - Mention key details from the test case parameters.
             - Be concise, clear, and focused on the evaluation logic.
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
 
             Only return valid JSON. Do **not** include any extra commentary or text.
 
@@ -95,6 +109,7 @@ class GEvalTemplate:
         test_case_content: str,
         parameters: str,
         _additional_context: Optional[str] = None,
+        multimodal: bool = False,
     ):
         additional_context = (
             f"\n\nAdditional Context:\n{_additional_context}\n"
@@ -104,6 +119,8 @@ class GEvalTemplate:
         return textwrap.dedent(
             f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
 
+            {GEvalTemplate.multimodal_rules if multimodal else ""}
+
             Evaluation Steps:
             {evaluation_steps}
 
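
The change here is mechanical: each prompt builder gains a `multimodal` keyword (defaulting to False) and interpolates the shared `multimodal_rules` block only when it is set. A minimal caller-side sketch, using the class and import path from the file list above; the argument values are illustrative, not taken from deepeval:

    from deepeval.metrics.g_eval.template import GEvalTemplate

    # With multimodal=False (the default) the prompt is effectively unchanged;
    # with multimodal=True the MULTIMODAL INPUT RULES block is appended.
    prompt = GEvalTemplate.generate_evaluation_steps(
        parameters="input and actual output",
        criteria="Judge whether the actual output answers the input.",
        multimodal=True,
    )
    print(prompt)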

deepeval/metrics/g_eval/utils.py

@@ -9,8 +9,8 @@ from deepeval.test_case import (
     LLMTestCase,
     ToolCall,
 )
-from deepeval.models.llms.openai_model import unsupported_log_probs_gpt_models
 from pydantic import BaseModel, field_validator
+from deepeval.models.llms.constants import OPENAI_MODELS_DATA
 
 from deepeval.test_case.conversational_test_case import ConversationalTestCase
 
@@ -114,16 +114,17 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:
 
 def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):
 
-    if isinstance(model, str) and model in unsupported_log_probs_gpt_models:
-        return True
+    if isinstance(model, str):
+        model_data = OPENAI_MODELS_DATA.get(model)
+        if not model_data.supports_log_probs:
+            return True
     elif (
-        isinstance(model, GPTModel)
-        and model.model_name in unsupported_log_probs_gpt_models
+        isinstance(model, GPTModel) and not model.model_data.supports_log_probs
    ):
         return True
     elif (
         isinstance(model, AzureOpenAIModel)
-        and model.model_name in unsupported_log_probs_gpt_models
+        and not model.model_data.supports_log_probs
     ):
         return True
 
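
The hard-coded `unsupported_log_probs_gpt_models` set is gone; log-prob support is now read from the per-model metadata in the new `deepeval/models/llms/constants.py` (`OPENAI_MODELS_DATA`). The contents of that table are not part of these hunks, so the sketch below is a hedged illustration of the lookup pattern with a hypothetical `supports_log_probs` helper:

    from deepeval.models.llms.constants import OPENAI_MODELS_DATA

    def supports_log_probs(model_name: str) -> bool:
        # Hypothetical helper: the field name `supports_log_probs` is taken from
        # the hunk above; the shape of OPENAI_MODELS_DATA entries is assumed.
        model_data = OPENAI_MODELS_DATA.get(model_name)
        # Treat unknown model names as lacking log-prob support; note that the
        # hunk itself calls .supports_log_probs on the lookup result directly.
        return model_data is not None and bool(model_data.supports_log_probs)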

deepeval/metrics/goal_accuracy/goal_accuracy.py

@@ -3,11 +3,12 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
     print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
@@ -55,8 +56,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -80,17 +87,21 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
             goal_scores = [
                 self._get_goal_accuracy_score(
-                    task.user_goal, task.steps_taken
+                    task.user_goal, task.steps_taken, multimodal
                 )
                 for task in goal_and_steps_taken
             ]
             plan_scores = [
-                self._get_plan_scores(task.user_goal, task.steps_taken)
+                self._get_plan_scores(
+                    task.user_goal, task.steps_taken, multimodal
+                )
                 for task in goal_and_steps_taken
             ]
             self.score = self._calculate_score(goal_scores, plan_scores)
             self.success = self.score >= self.threshold
-            self.reason = self._generate_reason(goal_scores, plan_scores)
+            self.reason = self._generate_reason(
+                goal_scores, plan_scores, multimodal
+            )
 
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -117,8 +128,14 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ):
+        multimodal = test_case.multimodal
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            None,
+            self.model,
+            multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -134,21 +151,23 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_scores = await asyncio.gather(
                 *[
                     self._a_get_goal_accuracy_score(
-                        task.user_goal, task.steps_taken
+                        task.user_goal, task.steps_taken, multimodal
                     )
                     for task in goal_and_steps_taken
                 ]
             )
             plan_scores = await asyncio.gather(
                 *[
-                    self._a_get_plan_scores(task.user_goal, task.steps_taken)
+                    self._a_get_plan_scores(
+                        task.user_goal, task.steps_taken, multimodal
+                    )
                     for task in goal_and_steps_taken
                 ]
             )
             self.score = self._calculate_score(goal_scores, plan_scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(
-                goal_scores, plan_scores
+                goal_scores, plan_scores, multimodal
             )
 
             self.verbose_logs = construct_verbose_logs(
@@ -191,41 +210,31 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             goal_and_steps_taken.append(new_goal_steps)
         return goal_and_steps_taken
 
-    def _get_plan_scores(self, user_goal, steps_taken):
+    def _get_plan_scores(self, user_goal, steps_taken, multimodal: bool):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = self.model.generate(prompt, schema=PlanScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)
 
-    async def _a_get_plan_scores(self, user_goal, steps_taken):
+    async def _a_get_plan_scores(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_plan_evaluation_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=PlanScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: PlanScore(**data),
        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=PlanScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: PlanScore = await self.model.a_generate(
-                    prompt, schema=PlanScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return PlanScore(**data)
 
     def _calculate_score(
         self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
@@ -240,7 +249,10 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         return 0 if self.strict_mode and score < self.threshold else score
 
     def _generate_reason(
-        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -254,18 +266,25 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
 
         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score, self.threshold, goal_evaluations, plan_evalautions
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)
             return res
         else:
             res = self.model.generate(prompt)
             return res
 
     async def _a_generate_reason(
-        self, goal_scores: List[GoalScore], plan_scores: List[PlanScore]
+        self,
+        goal_scores: List[GoalScore],
+        plan_scores: List[PlanScore],
+        multimodal: bool,
     ):
         goal_evaluations = ""
         for goal_score in goal_scores:
@@ -279,51 +298,47 @@ class GoalAccuracyMetric(BaseConversationalMetric):
             )
 
         prompt = GoalAccuracyTemplate.get_final_reason(
-            self.score, self.threshold, goal_evaluations, plan_evalautions
+            self.score,
+            self.threshold,
+            goal_evaluations,
+            plan_evalautions,
+            multimodal,
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
+            self._accrue_cost(cost)
             return res
         else:
             res = await self.model.a_generate(prompt)
             return res
 
-    def _get_goal_accuracy_score(self, user_goal, steps_taken):
+    def _get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = self.model.generate(prompt, schema=GoalScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)
 
-    async def _a_get_goal_accuracy_score(self, user_goal, steps_taken):
+    async def _a_get_goal_accuracy_score(
+        self, user_goal, steps_taken, multimodal: bool
+    ):
         prompt = GoalAccuracyTemplate.get_accuracy_score(
-            user_goal, "\n".join(steps_taken)
+            user_goal, "\n".join(steps_taken), multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=GoalScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: GoalScore(**data),
        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=GoalScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: GoalScore = await self.model.a_generate(
-                    prompt, schema=GoalScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return GoalScore(**data)
 
     def print_goals_and_steps_taken(self, goals_and_steps):
         final_goals_and_steps = ""
@@ -340,7 +355,7 @@ class GoalAccuracyMetric(BaseConversationalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
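
Most of the removed lines above are the per-metric branching on native vs. custom judge models, now consolidated behind `generate_with_schema_and_extract` and its async twin in `deepeval/metrics/utils.py` (+158 -122 in the file list). That module's body is not shown in these hunks; judging from the boilerplate it replaces, the synchronous helper plausibly looks roughly like this sketch (the signature and cost-accrual details are assumptions):

    from deepeval.metrics.utils import trimAndLoadJson  # deepeval's existing JSON loader

    def generate_with_schema_and_extract(
        metric, prompt, schema_cls, extract_schema, extract_json
    ):
        # Native judge models return (parsed schema, cost).
        if metric.using_native_model:
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost  # the real helper may route through _accrue_cost
            return extract_schema(res)
        # Custom models: try schema-aware generation, fall back to raw JSON.
        try:
            return extract_schema(metric.model.generate(prompt, schema=schema_cls))
        except TypeError:
            data = trimAndLoadJson(metric.model.generate(prompt), metric)
            return extract_json(data)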

deepeval/metrics/goal_accuracy/template.py

@@ -3,8 +3,16 @@ import textwrap
 
 
 class GoalAccuracyTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def get_accuracy_score(task, steps_taken):
+    def get_accuracy_score(task, steps_taken, multimodal: bool = False):
         return textwrap.dedent(
             f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.
 
@@ -36,6 +44,8 @@ class GoalAccuracyTemplate:
             - When uncertain, assume the goal was **not achieved**.
             - The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.
 
+            {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
             SCORING GUIDE:
 
             - **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
@@ -102,7 +112,7 @@ class GoalAccuracyTemplate:
         )
 
     @staticmethod
-    def get_plan_evaluation_score(task, steps_taken):
+    def get_plan_evaluation_score(task, steps_taken, multimodal: bool = False):
         return textwrap.dedent(
             f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.
 
@@ -132,6 +142,8 @@ class GoalAccuracyTemplate:
             - Tool use should be coherent within the plan, not ad hoc or speculative.
             - This evaluation excludes correctness or efficiency — focus solely on plan and adherence.
 
+            {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
             SCORING GUIDE:
 
             - **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
@@ -188,7 +200,11 @@ class GoalAccuracyTemplate:
 
     @staticmethod
     def get_final_reason(
-        final_score, threshold, goal_evaluations, plan_evalautions
+        final_score,
+        threshold,
+        goal_evaluations,
+        plan_evalautions,
+        multimodal: bool = False,
     ):
         return textwrap.dedent(
             f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
@@ -213,6 +229,8 @@ class GoalAccuracyTemplate:
             - If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
             - Avoid vague praise or criticism — ground the reason in the actual scores and justifications.
 
+            {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
             ---
 
             FORMAT:
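
As with `GEvalTemplate`, the new `multimodal` parameter defaults to False, so existing call sites keep working. A small illustrative call; the argument values are made up, and the metric itself passes `"\n".join(steps_taken)` as the second argument:

    from deepeval.metrics.goal_accuracy.template import GoalAccuracyTemplate

    prompt = GoalAccuracyTemplate.get_accuracy_score(
        task="Book a table for two at 7pm",
        steps_taken="Searched nearby restaurants\nConfirmed a 7pm booking",
        multimodal=False,  # True appends GoalAccuracyTemplate.multimodal_rules
    )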

deepeval/metrics/hallucination/hallucination.py

@@ -8,14 +8,19 @@ from deepeval.metrics import BaseMetric
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.hallucination.template import HallucinationTemplate
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.hallucination.schema import *
+from deepeval.metrics.hallucination.schema import (
+    HallucinationVerdict,
+    Verdicts,
+    HallucinationScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -55,7 +60,16 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -102,7 +116,16 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -150,22 +173,13 @@ class HallucinationMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=HallucinationScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: HallucinationScoreReason = await self.model.a_generate(
-                    prompt, schema=HallucinationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=HallucinationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason(self):
         if self.include_reason is False:
@@ -185,74 +199,45 @@ class HallucinationMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=HallucinationScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: HallucinationScoreReason = self.model.generate(
-                    prompt, schema=HallucinationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=HallucinationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, actual_output: str, contexts: List[str]
     ) -> List[HallucinationVerdict]:
-        verdicts: List[HallucinationVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             actual_output=actual_output, contexts=contexts
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    HallucinationVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                HallucinationVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(
         self, actual_output: str, contexts: List[str]
     ) -> List[HallucinationVerdict]:
-        verdicts: List[HallucinationVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             actual_output=actual_output, contexts=contexts
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    HallucinationVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                HallucinationVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -273,7 +258,7 @@ class HallucinationMetric(BaseMetric):
         else:
             try:
                 self.success = self.score <= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
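
From the caller's perspective nothing changes for text-only evaluation; the metric now also reads a `multimodal` flag from the test case itself (per this diff, `LLMTestCase` picks up the multimodal role of the removed `mllm_test_case.py`). A hedged usage sketch with made-up values; success for this metric still means `score <= threshold`:

    from deepeval.test_case import LLMTestCase
    from deepeval.metrics import HallucinationMetric

    test_case = LLMTestCase(
        input="What does the chart show?",
        actual_output="Revenue rose every quarter.",
        context=["Quarterly revenue increased from Q1 to Q4."],
    )
    metric = HallucinationMetric(threshold=0.5)
    metric.measure(test_case)
    print(metric.score, metric.reason)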

deepeval/metrics/hallucination/template.py

@@ -2,9 +2,20 @@ from typing import List
 
 
 class HallucinationTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(actual_output: str, contexts: List[str]):
         return f"""For each context in contexts, which is a list of strings, please generate a list of JSON objects to indicate whether the given 'actual output' agrees with EACH context. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{HallucinationTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the given text agrees with the context.
 The 'reason' is the reason for the verdict. When the answer is 'no', try to provide a correction in the reason.
 
@@ -46,6 +57,8 @@ JSON:
     ):
         return f"""Given a list of factual alignments and contradictions, which highlights alignment/contradictions between the `actual output` and `contexts, use it to provide a reason for the hallucination score in a CONCISELY. Note that The hallucination score ranges from 0 - 1, and the lower the better.
 
+{HallucinationTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
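
Unlike the G-Eval and goal-accuracy templates, this one interpolates `multimodal_rules` unconditionally; there is no `if multimodal` guard in either prompt. A small call sketch with illustrative values:

    from deepeval.metrics.hallucination.template import HallucinationTemplate

    prompt = HallucinationTemplate.generate_verdicts(
        actual_output="Revenue rose every quarter.",
        contexts=["Quarterly revenue increased from Q1 to Q4."],
    )
    # The returned prompt now always contains the MULTIMODAL INPUT RULES block.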