deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
--- a/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
+++ b/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
@@ -3,39 +3,43 @@ from typing import Optional, List, Tuple, Union
 import math
 import textwrap
 
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_editing.template import (
     ImageEditingTemplate,
 )
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator
 
 
-class ImageEditingMetric(BaseMultimodalMetric):
+class ImageEditingMetric(BaseMetric):
 
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -44,13 +48,19 @@ class ImageEditingMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            1,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,12 +78,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                input_texts, input_images = self.separate_images_from_text(
-                    test_case.input
-                )
-                _, output_images = self.separate_images_from_text(
+                input = convert_to_multi_modal_array(test_case.input)
+                actual_output = convert_to_multi_modal_array(
                     test_case.actual_output
                 )
+                input_texts, input_images = self.separate_images_from_text(
+                    input
+                )
+                _, output_images = self.separate_images_from_text(actual_output)
 
                 self.SC_scores, self.SC_reasoning = (
                     self._evaluate_semantic_consistency(
@@ -98,7 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                     steps=[
                         f"Semantic Consistency Scores:\n{self.SC_scores}",
                         f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                        f"Perceptual Quality Scores:\n{self.SC_scores}",
+                        f"Perceptual Quality Scores:\n{self.PQ_scores}",
                         f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
@@ -107,13 +119,19 @@ class ImageEditingMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            1,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -123,12 +141,12 @@ class ImageEditingMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            input_texts, input_images = self.separate_images_from_text(
-                test_case.input
-            )
-            _, output_images = self.separate_images_from_text(
+            input = convert_to_multi_modal_array(test_case.input)
+            actual_output = convert_to_multi_modal_array(
                 test_case.actual_output
             )
+            input_texts, input_images = self.separate_images_from_text(input)
+            _, output_images = self.separate_images_from_text(actual_output)
             (self.SC_scores, self.SC_reasoning), (
                 self.PQ_scores,
                 self.PQ_reasoning,
@@ -153,7 +171,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.SC_scores}",
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -185,24 +203,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 text_prompt=text_prompt
             )
         ]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt + images, schema=ReasonScore
-            )
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(
-                    prompt + images, input_text=prompt
-                )
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_semantic_consistency(
         self,
@@ -217,20 +224,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 text_prompt=text_prompt
            )
         ]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def _a_evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -239,22 +239,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
         prompt = [
             ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt + images, schema=ReasonScore
-            )
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -263,22 +254,15 @@ class ImageEditingMetric(BaseMultimodalMetric):
         prompt = [
             ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
-    def _calculate_score(self) -> List[str]:
+    def _calculate_score(self) -> float:
         min_SC_score = min(self.SC_scores)
         min_PQ_score = min(self.PQ_scores)
         return math.sqrt(min_SC_score * min_PQ_score) / 10
@@ -288,14 +272,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
     def _generate_reason(
         self,
-    ) -> Tuple[List[float], str]:
+    ) -> str:
         return textwrap.dedent(
             f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}
--- a/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py
+++ b/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py
@@ -1,42 +1,46 @@
 import asyncio
 from typing import Optional, List, Tuple, Union
 
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_helpfulness.template import (
     ImageHelpfulnessTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_helpfulness.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 
 
-class ImageHelpfulnessMetric(BaseMultimodalMetric):
+class ImageHelpfulnessMetric(BaseMetric):
 
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -46,13 +50,19 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +79,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +158,19 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +179,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
        ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,21 +274,14 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def a_evaluate_image_helpfulness(
         self,
@@ -279,21 +292,14 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -328,7 +334,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]
 
-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)
 
     def is_successful(self) -> bool:
@@ -337,7 +343,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
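
Note on the recurring change: nearly every per-metric hunk above replaces the same hand-rolled branching (native models returning a result/cost pair, custom models attempting schema-typed generation and falling back to raw-JSON parsing on TypeError) with the new generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. The sketch below is a minimal, self-contained illustration of that consolidation pattern; the helper name and its keyword arguments are taken from the hunks above, but the toy model, toy schema, and the helper body shown here are illustrative assumptions, not deepeval's actual implementation.

# Illustrative sketch only: FakeNativeModel, ReasonScore, and this helper body are
# stand-ins; only the helper's name and keyword arguments come from the diff above.
import json
from dataclasses import dataclass
from typing import Any, Callable, List, Optional, Tuple, Type


@dataclass
class ReasonScore:  # stand-in for the per-metric pydantic schema
    score: List[float]
    reasoning: List[str]


class FakeNativeModel:
    """Toy model: 'native' models return (parsed result, cost) from generate()."""

    def generate(self, prompt: str, schema: Optional[Type] = None):
        result = ReasonScore(
            score=[8.0, 9.0],
            reasoning=["edit matches prompt", "image is clean"],
        )
        return result, 0.0001  # (result, evaluation cost)


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type,
    extract_schema: Callable[[Any], Tuple],
    extract_json: Callable[[dict], Tuple],
) -> Tuple:
    # Consolidates the deleted boilerplate: track cost for native models,
    # otherwise try schema-typed generation and fall back to JSON on TypeError.
    if metric.using_native_model:
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        raw = metric.model.generate(prompt)
        return extract_json(json.loads(raw))


class ToyMetric:
    def __init__(self):
        self.model = FakeNativeModel()
        self.using_native_model = True
        self.evaluation_cost = 0.0


if __name__ == "__main__":
    scores, reasons = generate_with_schema_and_extract(
        metric=ToyMetric(),
        prompt="Rate the edited image against the editing instructions.",
        schema_cls=ReasonScore,
        extract_schema=lambda s: (s.score, s.reasoning),
        extract_json=lambda data: (data["score"], data["reasoning"]),
    )
    print(scores, reasons)

The same extract_schema/extract_json pair appears verbatim at every converted call site in the two files shown, which is what lets the metric modules shed so many lines in this release.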