deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0

deepeval/metrics/misuse/misuse.py
@@ -10,12 +10,18 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.misuse.template import MisuseTemplate
-from deepeval.metrics.misuse.schema import *
+from deepeval.metrics.misuse.schema import (
+    Misuses,
+    MisuseVerdict,
+    Verdicts,
+    MisuseScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager


@@ -57,7 +63,16 @@ class MisuseMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -104,7 +119,16 @@ class MisuseMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -136,7 +160,7 @@ class MisuseMetric(BaseMetric):
             )
             return self.score

-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -150,24 +174,15 @@ class MisuseMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=MisuseScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MisuseScoreReason = await self.model.a_generate(
-                    prompt, schema=MisuseScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MisuseScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

-    def _generate_reason(self) -> str:
+    def _generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -181,106 +196,71 @@ class MisuseMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=MisuseScoreReason)
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MisuseScoreReason = self.model.generate(
-                    prompt, schema=MisuseScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MisuseScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(self) -> List[MisuseVerdict]:
         if len(self.misuses) == 0:
             return []

-        verdicts: List[MisuseVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             misuses=self.misuses, domain=self.domain
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [MisuseVerdict(**item) for item in data["verdicts"]]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                MisuseVerdict(**item) for item in data["verdicts"]
+            ],
+        )

     def _generate_verdicts(self) -> List[MisuseVerdict]:
         if len(self.misuses) == 0:
             return []

-        verdicts: List[MisuseVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             misuses=self.misuses, domain=self.domain
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [MisuseVerdict(**item) for item in data["verdicts"]]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                MisuseVerdict(**item) for item in data["verdicts"]
+            ],
+        )

     async def _a_generate_misuses(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.generate_misuses(
             actual_output=actual_output, domain=self.domain
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Misuses)
-            self.evaluation_cost += cost
-            return res.misuses
-        else:
-            try:
-                res: Misuses = await self.model.a_generate(
-                    prompt, schema=Misuses
-                )
-                return res.misuses
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["misuses"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Misuses,
+            extract_schema=lambda s: s.misuses,
+            extract_json=lambda data: data["misuses"],
+        )

     def _generate_misuses(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.generate_misuses(
             actual_output=actual_output, domain=self.domain
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Misuses)
-            self.evaluation_cost += cost
-            return res.misuses
-        else:
-            try:
-                res: Misuses = self.model.generate(prompt, schema=Misuses)
-                return res.misuses
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["misuses"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Misuses,
+            extract_schema=lambda s: s.misuses,
+            extract_json=lambda data: data["misuses"],
+        )

     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -301,7 +281,7 @@ class MisuseMetric(BaseMetric):
         else:
             try:
                 self.success = self.score <= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

deepeval/metrics/misuse/template.py
@@ -2,10 +2,20 @@ from typing import List


 class MisuseTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_reason(misuse_violations: list, score: str):
         return f"""Based on the misuse violations identified: {misuse_violations}, and the misuse score: {score}, please provide a comprehensive reason for the score.

+{MisuseTemplate.multimodal_rules}
+
 In your reason, explain why the misuse score is {score} by addressing the specific misuse violations found (if any).

 **
@@ -32,6 +42,8 @@ For each misuse statement, return:
 - "verdict": "yes" if it represents inappropriate use of a {domain} chatbot, "no" if it doesn't
 - "reason": Brief explanation of your verdict (only if verdict is "yes")

+{MisuseTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key. No words or explanation is needed.
 **
@@ -56,6 +68,8 @@ Example JSON:

 Text: {actual_output}

+{MisuseTemplate.multimodal_rules}
+
 Focus on identifying statements that represent:
 - Queries or requests that fall outside the {domain} domain scope
 - Topics unrelated to {domain} expertise or focus area
@@ -66,6 +80,7 @@ Avoid flagging:
 - Appropriate {domain}-related requests and discussions
 - Queries within the {domain} expertise area

+
 Return a list of these misuse statements.

 **

deepeval/metrics/multimodal_metrics/__init__.py
@@ -3,4 +3,3 @@ from .image_editing.image_editing import ImageEditingMetric
 from .image_coherence.image_coherence import ImageCoherenceMetric
 from .image_helpfulness.image_helpfulness import ImageHelpfulnessMetric
 from .image_reference.image_reference import ImageReferenceMetric
-from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval

deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
@@ -1,16 +1,17 @@
 import asyncio
 from typing import Optional, List, Tuple, Union

-from deepeval.metrics import BaseMultimodalMetric
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_coherence.template import (
     ImageCoherenceTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_coherence.schema import (
@@ -23,7 +24,7 @@ from deepeval.utils import (
 )


-class ImageCoherenceMetric(BaseMultimodalMetric):
+class ImageCoherenceMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
@@ -53,8 +54,14 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -155,8 +162,14 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -261,20 +274,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def a_evaluate_image_coherence(
         self,
@@ -286,20 +292,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -334,7 +333,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]

-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)

     def is_successful(self) -> bool:
@@ -343,7 +342,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
@@ -3,7 +3,7 @@ from typing import Optional, List, Tuple, Union
 import math
 import textwrap

-from deepeval.metrics import BaseMultimodalMetric
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_editing.template import (
     ImageEditingTemplate,
@@ -14,16 +14,17 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator


-class ImageEditingMetric(BaseMultimodalMetric):
+class ImageEditingMetric(BaseMetric):

     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
@@ -52,8 +53,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            1,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -103,7 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.SC_scores}",
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -117,8 +124,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self, self.model
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            1,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -158,7 +171,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.SC_scores}",
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -190,24 +203,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 text_prompt=text_prompt
             )
         ]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt + images, schema=ReasonScore
-            )
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(
-                    prompt + images, input_text=prompt
-                )
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def _evaluate_semantic_consistency(
         self,
@@ -222,20 +224,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 text_prompt=text_prompt
             )
         ]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def _a_evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -244,22 +239,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
         prompt = [
             ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt + images, schema=ReasonScore
-            )
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def _evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -268,22 +254,15 @@ class ImageEditingMetric(BaseMultimodalMetric):
         prompt = [
             ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

-    def _calculate_score(self) -> List[str]:
+    def _calculate_score(self) -> float:
         min_SC_score = min(self.SC_scores)
         min_PQ_score = min(self.PQ_scores)
         return math.sqrt(min_SC_score * min_PQ_score) / 10
@@ -293,14 +272,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

     def _generate_reason(
         self,
-    ) -> Tuple[List[float], str]:
+    ) -> str:
         return textwrap.dedent(
             f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}
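
The recurring change across every metric file above is the same: the inlined native-model / schema / JSON-fallback branches are collapsed into the shared generate_with_schema_and_extract and a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. That module changed in this release (+161 -91) but its body is not part of the hunks shown here, so the following is only an illustrative sketch of what the synchronous helper presumably does, mirroring the branches that were deleted; names and json.loads are stand-ins, not the actual implementation.

# Hypothetical sketch of the shared helper, reconstructed from the deleted per-metric
# branches shown in this diff. Not the real deepeval/metrics/utils.py code.
import json
from typing import Any, Callable, Type, TypeVar

from pydantic import BaseModel

T = TypeVar("T")


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], T],
    extract_json: Callable[[dict], T],
) -> T:
    # Native models return a (result, cost) tuple; the cost is accumulated on the
    # metric, exactly as each deleted branch did with self.evaluation_cost += cost.
    if metric.using_native_model:
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    # Custom models may not accept a schema keyword; fall back to parsing raw JSON.
    # (The old inlined code used deepeval's trimAndLoadJson repair helper here,
    # not plain json.loads.)
    try:
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        raw = metric.model.generate(prompt)
        return extract_json(json.loads(raw))

The async twin presumably awaits model.a_generate through the same three branches, which is why each refactored call site only supplies the prompt, the schema class, and the two extraction lambdas.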