deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/misuse/misuse.py
@@ -10,12 +10,18 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.misuse.template import MisuseTemplate
-from deepeval.metrics.misuse.schema import *
+from deepeval.metrics.misuse.schema import (
+    Misuses,
+    MisuseVerdict,
+    Verdicts,
+    MisuseScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -57,7 +63,16 @@ class MisuseMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -104,7 +119,16 @@ class MisuseMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -136,7 +160,7 @@ class MisuseMetric(BaseMetric):
             )
         return self.score
 
-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -150,24 +174,15 @@ class MisuseMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=MisuseScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MisuseScoreReason = await self.model.a_generate(
-                    prompt, schema=MisuseScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MisuseScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    def _generate_reason(self) -> str:
+    def _generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -181,106 +196,71 @@ class MisuseMetric(BaseMetric):
             score=format(self.score, ".2f"),
        )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=MisuseScoreReason)
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MisuseScoreReason = self.model.generate(
-                    prompt, schema=MisuseScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MisuseScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(self) -> List[MisuseVerdict]:
         if len(self.misuses) == 0:
             return []
 
-        verdicts: List[MisuseVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             misuses=self.misuses, domain=self.domain
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [MisuseVerdict(**item) for item in data["verdicts"]]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                MisuseVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(self) -> List[MisuseVerdict]:
         if len(self.misuses) == 0:
             return []
 
-        verdicts: List[MisuseVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             misuses=self.misuses, domain=self.domain
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [MisuseVerdict(**item) for item in data["verdicts"]]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                MisuseVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     async def _a_generate_misuses(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.generate_misuses(
             actual_output=actual_output, domain=self.domain
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Misuses)
-            self.evaluation_cost += cost
-            return res.misuses
-        else:
-            try:
-                res: Misuses = await self.model.a_generate(
-                    prompt, schema=Misuses
-                )
-                return res.misuses
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["misuses"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Misuses,
+            extract_schema=lambda s: s.misuses,
+            extract_json=lambda data: data["misuses"],
+        )
 
     def _generate_misuses(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.generate_misuses(
             actual_output=actual_output, domain=self.domain
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Misuses)
-            self.evaluation_cost += cost
-            return res.misuses
-        else:
-            try:
-                res: Misuses = self.model.generate(prompt, schema=Misuses)
-                return res.misuses
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["misuses"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Misuses,
+            extract_schema=lambda s: s.misuses,
+            extract_json=lambda data: data["misuses"],
+        )
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -301,7 +281,7 @@ class MisuseMetric(BaseMetric):
         else:
             try:
                 self.success = self.score <= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
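The hand-rolled native-model/custom-model branching that the misuse.py hunks delete is now routed through a shared pair of helpers imported from deepeval.metrics.utils. Below is a minimal sketch of what generate_with_schema_and_extract plausibly does, inferred from the call sites above and from the branching it replaces; the actual 3.7.6 implementation may differ, and the async variant presumably mirrors it with a_generate and await.

# Hypothetical sketch of the shared helper the hunks above call into
# (deepeval.metrics.utils.generate_with_schema_and_extract). Behaviour is
# inferred from the per-metric code it replaces, not copied from 3.7.6.
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native models return (parsed schema, cost); accumulate cost on the metric.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that support structured output return the schema directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback: generate raw text, trim it to JSON, and extract the fields.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

Factoring the fallback into one place is what lets each metric drop its trimAndLoadJson import and the repeated try/except TypeError blocks, keeping only the two lambdas that say how to pull the result out.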
deepeval/metrics/misuse/template.py
@@ -2,10 +2,20 @@ from typing import List
 
 
 class MisuseTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_reason(misuse_violations: list, score: str):
         return f"""Based on the misuse violations identified: {misuse_violations}, and the misuse score: {score}, please provide a comprehensive reason for the score.
 
+{MisuseTemplate.multimodal_rules}
+
 In your reason, explain why the misuse score is {score} by addressing the specific misuse violations found (if any).
 
 **
@@ -32,6 +42,8 @@ For each misuse statement, return:
 - "verdict": "yes" if it represents inappropriate use of a {domain} chatbot, "no" if it doesn't
 - "reason": Brief explanation of your verdict (only if verdict is "yes")
 
+{MisuseTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key. No words or explanation is needed.
 **
@@ -56,6 +68,8 @@ Example JSON:
 
 Text: {actual_output}
 
+{MisuseTemplate.multimodal_rules}
+
 Focus on identifying statements that represent:
 - Queries or requests that fall outside the {domain} domain scope
 - Topics unrelated to {domain} expertise or focus area
@@ -66,6 +80,7 @@ Avoid flagging:
 - Appropriate {domain}-related requests and discussions
 - Queries within the {domain} expertise area
 
+
 Return a list of these misuse statements.
 
 **
deepeval/metrics/multimodal_metrics/__init__.py
@@ -3,22 +3,3 @@ from .image_editing.image_editing import ImageEditingMetric
 from .image_coherence.image_coherence import ImageCoherenceMetric
 from .image_helpfulness.image_helpfulness import ImageHelpfulnessMetric
 from .image_reference.image_reference import ImageReferenceMetric
-from .multimodal_contextual_recall.multimodal_contextual_recall import (
-    MultimodalContextualRecallMetric,
-)
-from .multimodal_contextual_relevancy.multimodal_contextual_relevancy import (
-    MultimodalContextualRelevancyMetric,
-)
-from .multimodal_contextual_precision.multimodal_contextual_precision import (
-    MultimodalContextualPrecisionMetric,
-)
-from .multimodal_answer_relevancy.multimodal_answer_relevancy import (
-    MultimodalAnswerRelevancyMetric,
-)
-from .multimodal_faithfulness.multimodal_faithfulness import (
-    MultimodalFaithfulnessMetric,
-)
-from .multimodal_tool_correctness.multimodal_tool_correctness import (
-    MultimodalToolCorrectnessMetric,
-)
-from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
@@ -1,41 +1,45 @@
 import asyncio
 from typing import Optional, List, Tuple, Union
 
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_coherence.template import (
     ImageCoherenceTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_coherence.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 
 
-class ImageCoherenceMetric(BaseMultimodalMetric):
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+class ImageCoherenceMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -45,13 +49,19 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -68,7 +78,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -145,13 +157,19 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -160,7 +178,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -253,21 +273,14 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         instructions = ImageCoherenceTemplate.evaluate_image_coherence(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def a_evaluate_image_coherence(
         self,
@@ -278,21 +291,14 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         instructions = ImageCoherenceTemplate.evaluate_image_coherence(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -327,7 +333,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]
 
-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
        return sum(scores) / len(scores)
 
     def is_successful(self) -> bool:
@@ -336,7 +342,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
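With MLLMTestCase retired for this metric, ImageCoherenceMetric is now driven by a plain LLMTestCase whose actual_output mixes prose with MLLMImage entries and is normalized via convert_to_multi_modal_array. A hedged usage sketch under those assumptions follows; whether actual_output accepts a mixed list directly, and how test_case.multimodal is derived from it, is not confirmed by this diff.

# Assumed usage after the MLLMTestCase -> LLMTestCase migration shown above.
from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.image_coherence.image_coherence import (
    ImageCoherenceMetric,
)

# Assumption: actual_output may interleave text with MLLMImage entries, which the
# metric converts to a multimodal array before scoring each image in context.
test_case = LLMTestCase(
    input="Summarize the quarterly report and include the revenue chart.",
    actual_output=[
        "Revenue grew 12% quarter over quarter, as the chart below shows.",
        MLLMImage(url="https://example.com/revenue-chart.png"),  # hypothetical image
    ],
)

metric = ImageCoherenceMetric(threshold=0.5)
metric.measure(test_case)
print(metric.score)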