deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
@@ -1,42 +1,46 @@
 import asyncio
 from typing import Optional, List, Tuple, Union
 
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_reference.template import (
     ImageReferenceTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_reference.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 
 
-class ImageReferenceMetric(BaseMultimodalMetric):
+class ImageReferenceMetric(BaseMetric):
 
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -46,13 +50,19 @@ class ImageReferenceMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +79,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +158,19 @@ class ImageReferenceMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +179,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,21 +274,14 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def a_evaluate_image_reference(
         self,
@@ -279,21 +292,14 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -328,7 +334,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]
 
-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)
 
     def is_successful(self) -> bool:
@@ -337,7 +343,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
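The refactor shown above (and repeated in the TextToImageMetric diff below) replaces the per-call branching on self.using_native_model, with its trimAndLoadJson fallback, by the new generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. The helper bodies are not part of this diff; the sketch below is only a rough reconstruction inferred from the call sites and the removed branching, so treat the internals (everything except the keyword arguments visible above) as assumptions.

# Hypothetical reconstruction of the new helper, inferred from its call sites in
# this diff and from the branching it replaces; not the shipped implementation.
from typing import Any, Callable, Tuple, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # fallback JSON parser (assumed still used internally)


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Tuple],
    extract_json: Callable[[dict], Tuple],
) -> Tuple:
    if metric.using_native_model:
        # Native models return (parsed schema object, cost); accumulate evaluation cost.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that support structured output return the schema object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Schema-less custom models return raw text; parse it as JSON instead.
        res = metric.model.generate(prompt)
        return extract_json(trimAndLoadJson(res, metric))

An a_generate_with_schema_and_extract twin would presumably do the same over model.a_generate; centralizing this removes the near-identical try/except blocks that the deletions above show in every metric.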
deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
@@ -3,38 +3,42 @@ from typing import Optional, List, Tuple, Union
 import math
 import textwrap
 
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.text_to_image.template import (
     TextToImageTemplate,
 )
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.text_to_image.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator
 
-required_params: List[MLLMTestCaseParams] = [
-    MLLMTestCaseParams.INPUT,
-    MLLMTestCaseParams.ACTUAL_OUTPUT,
+required_params: List[LLMTestCaseParams] = [
+    LLMTestCaseParams.INPUT,
+    LLMTestCaseParams.ACTUAL_OUTPUT,
 ]
 
 
-class TextToImageMetric(BaseMultimodalMetric):
+class TextToImageMetric(BaseMetric):
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -43,11 +47,19 @@ class TextToImageMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
     ) -> float:
-        check_mllm_test_case_params(test_case, required_params, 0, 1, self)
+        check_llm_test_case_params(
+            test_case,
+            required_params,
+            0,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -63,10 +75,12 @@ class TextToImageMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                input_texts, _ = self.separate_images_from_text(test_case.input)
-                _, output_images = self.separate_images_from_text(
+                input = convert_to_multi_modal_array(test_case.input)
+                actual_output = convert_to_multi_modal_array(
                     test_case.actual_output
                 )
+                input_texts, _ = self.separate_images_from_text(input)
+                _, output_images = self.separate_images_from_text(actual_output)
 
                 self.SC_scores, self.SC_reasoning = (
                     self._evaluate_semantic_consistency(
@@ -90,7 +104,7 @@ class TextToImageMetric(BaseMultimodalMetric):
                     steps=[
                         f"Semantic Consistency Scores:\n{self.SC_scores}",
                         f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                        f"Perceptual Quality Scores:\n{self.SC_scores}",
+                        f"Perceptual Quality Scores:\n{self.PQ_scores}",
                         f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
@@ -99,11 +113,19 @@ class TextToImageMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
     ) -> float:
-        check_mllm_test_case_params(test_case, required_params, 0, 1, self)
+        check_llm_test_case_params(
+            test_case,
+            required_params,
+            0,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -112,10 +134,12 @@ class TextToImageMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            input_texts, _ = self.separate_images_from_text(test_case.input)
-            _, output_images = self.separate_images_from_text(
+            input = convert_to_multi_modal_array(test_case.input)
+            actual_output = convert_to_multi_modal_array(
                 test_case.actual_output
             )
+            input_texts, _ = self.separate_images_from_text(input)
+            _, output_images = self.separate_images_from_text(actual_output)
             (self.SC_scores, self.SC_reasoning), (
                 self.PQ_scores,
                 self.PQ_reasoning,
@@ -139,7 +163,7 @@ class TextToImageMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.SC_scores}",
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -163,106 +187,86 @@ class TextToImageMetric(BaseMultimodalMetric):
         text_prompt: str,
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
-        images: List[MLLMImage] = []
-        images.append(actual_image_output)
-        prompt = [
-            TextToImageTemplate.generate_semantic_consistency_evaluation_results(
-                text_prompt=text_prompt
-            )
-        ]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt + images, ReasonScore
-            )
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(
-                    prompt + images, input_text=prompt
+        images: List[MLLMImage] = [actual_image_output]
+        prompt = f"""
+        {
+            TextToImageTemplate.generate_semantic_consistency_evaluation_results(
+                text_prompt=text_prompt
                 )
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        }
+        Images:
+        {images}
+        """
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_semantic_consistency(
         self,
         text_prompt: str,
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
-        images: List[MLLMImage] = []
-        images.append(actual_image_output)
-        prompt = [
-            TextToImageTemplate.generate_semantic_consistency_evaluation_results(
-                text_prompt=text_prompt
-            )
-        ]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt + images, schema=ReasonScore
+        images: List[MLLMImage] = [actual_image_output]
+        prompt = f"""
+        {
+            TextToImageTemplate.generate_semantic_consistency_evaluation_results(
+                text_prompt=text_prompt
                 )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        }
+        Images:
+        {images}
+        """
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def _a_evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = [actual_image_output]
-        prompt = [
-            TextToImageTemplate.generate_perceptual_quality_evaluation_results()
-        ]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt + images, ReasonScore
-            )
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"""
+        {
+            TextToImageTemplate.generate_perceptual_quality_evaluation_results()
+        }
+        Images:
+        {images}
+        """
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = [actual_image_output]
-        prompt = [
-            TextToImageTemplate.generate_perceptual_quality_evaluation_results()
-        ]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"""
+        {
+            TextToImageTemplate.generate_perceptual_quality_evaluation_results()
+        }
+        Images:
+        {images}
+        """
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
-    def _calculate_score(self) -> List[str]:
+    def _calculate_score(self) -> float:
         min_SC_score = min(self.SC_scores)
         min_PQ_score = min(self.PQ_scores)
         return math.sqrt(min_SC_score * min_PQ_score) / 10
@@ -272,14 +276,12 @@ class TextToImageMetric(BaseMultimodalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
-    def _generate_reason(
-        self,
-    ) -> Tuple[List[float], str]:
+    def _generate_reason(self) -> str:
         return textwrap.dedent(
             f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}
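After the migration, both metrics are driven by a plain LLMTestCase instead of the removed MLLMTestCase (deepeval/test_case/mllm_test_case.py is deleted in this release). A minimal usage sketch follows, assuming LLMTestCase in 3.7.6 accepts mixed str/MLLMImage content and exposes the multimodal flag implied by convert_to_multi_modal_array and check_llm_test_case_params above; the exact constructor shape is not shown in this diff.

# Usage sketch only; the multimodal LLMTestCase constructor shape is an
# assumption inferred from this diff, not confirmed 3.7.6 API.
from deepeval.metrics.multimodal_metrics.text_to_image.text_to_image import (
    TextToImageMetric,
)
from deepeval.test_case import LLMTestCase, MLLMImage

test_case = LLMTestCase(
    input="Generate a watercolor painting of a lighthouse at dusk.",
    # convert_to_multi_modal_array() normalizes this into the
    # List[Union[str, MLLMImage]] the metric iterates over.
    actual_output=[MLLMImage(url="https://example.com/lighthouse.png")],
)

metric = TextToImageMetric(threshold=0.5)
metric.measure(test_case)
print(metric.score, metric.reason)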