deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. deepeval/_version.py +1 -1
  2. deepeval/dataset/golden.py +54 -2
  3. deepeval/evaluate/evaluate.py +16 -8
  4. deepeval/evaluate/execute.py +70 -26
  5. deepeval/evaluate/utils.py +26 -22
  6. deepeval/integrations/pydantic_ai/agent.py +19 -2
  7. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  8. deepeval/metrics/__init__.py +14 -12
  9. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  10. deepeval/metrics/answer_relevancy/template.py +188 -92
  11. deepeval/metrics/base_metric.py +2 -5
  12. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  13. deepeval/metrics/contextual_precision/template.py +115 -66
  14. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  15. deepeval/metrics/contextual_recall/template.py +106 -55
  16. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  17. deepeval/metrics/contextual_relevancy/template.py +87 -58
  18. deepeval/metrics/dag/templates.py +2 -2
  19. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  20. deepeval/metrics/faithfulness/schema.py +1 -1
  21. deepeval/metrics/faithfulness/template.py +200 -115
  22. deepeval/metrics/g_eval/utils.py +2 -2
  23. deepeval/metrics/indicator.py +4 -4
  24. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  25. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  26. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  27. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  28. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  29. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  30. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  31. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  32. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  33. deepeval/metrics/ragas.py +3 -3
  34. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  35. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  36. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  37. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  38. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  39. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  40. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  41. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  42. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  43. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  44. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  45. deepeval/metrics/turn_faithfulness/template.py +218 -0
  46. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  47. deepeval/metrics/utils.py +39 -58
  48. deepeval/models/__init__.py +0 -12
  49. deepeval/models/base_model.py +16 -38
  50. deepeval/models/embedding_models/__init__.py +7 -0
  51. deepeval/models/embedding_models/azure_embedding_model.py +52 -28
  52. deepeval/models/embedding_models/local_embedding_model.py +18 -14
  53. deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
  54. deepeval/models/embedding_models/openai_embedding_model.py +40 -21
  55. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  56. deepeval/models/llms/anthropic_model.py +44 -23
  57. deepeval/models/llms/azure_model.py +121 -36
  58. deepeval/models/llms/deepseek_model.py +18 -13
  59. deepeval/models/llms/gemini_model.py +129 -43
  60. deepeval/models/llms/grok_model.py +18 -13
  61. deepeval/models/llms/kimi_model.py +18 -13
  62. deepeval/models/llms/litellm_model.py +42 -22
  63. deepeval/models/llms/local_model.py +12 -7
  64. deepeval/models/llms/ollama_model.py +114 -12
  65. deepeval/models/llms/openai_model.py +137 -41
  66. deepeval/models/llms/portkey_model.py +24 -7
  67. deepeval/models/llms/utils.py +5 -3
  68. deepeval/models/retry_policy.py +17 -14
  69. deepeval/models/utils.py +46 -1
  70. deepeval/optimizer/__init__.py +5 -0
  71. deepeval/optimizer/algorithms/__init__.py +6 -0
  72. deepeval/optimizer/algorithms/base.py +29 -0
  73. deepeval/optimizer/algorithms/configs.py +18 -0
  74. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  75. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  76. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  77. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  78. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  79. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  80. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  81. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  82. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  83. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  84. deepeval/{optimization → optimizer}/configs.py +5 -8
  85. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  86. deepeval/optimizer/prompt_optimizer.py +263 -0
  87. deepeval/optimizer/rewriter/__init__.py +5 -0
  88. deepeval/optimizer/rewriter/rewriter.py +124 -0
  89. deepeval/optimizer/rewriter/utils.py +214 -0
  90. deepeval/optimizer/scorer/__init__.py +5 -0
  91. deepeval/optimizer/scorer/base.py +86 -0
  92. deepeval/optimizer/scorer/scorer.py +316 -0
  93. deepeval/optimizer/scorer/utils.py +30 -0
  94. deepeval/optimizer/types.py +148 -0
  95. deepeval/{optimization → optimizer}/utils.py +47 -165
  96. deepeval/prompt/prompt.py +5 -9
  97. deepeval/test_case/__init__.py +1 -3
  98. deepeval/test_case/api.py +12 -10
  99. deepeval/test_case/conversational_test_case.py +19 -1
  100. deepeval/test_case/llm_test_case.py +152 -1
  101. deepeval/test_case/utils.py +4 -8
  102. deepeval/test_run/api.py +15 -14
  103. deepeval/test_run/test_run.py +3 -3
  104. deepeval/tracing/patchers.py +9 -4
  105. deepeval/tracing/tracing.py +2 -2
  106. deepeval/utils.py +65 -0
  107. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  108. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
  109. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  110. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  111. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  112. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  113. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  114. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  115. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  116. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  117. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  118. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  119. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  120. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  121. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  122. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  123. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  124. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  125. deepeval/models/mlllms/__init__.py +0 -4
  126. deepeval/models/mlllms/azure_model.py +0 -343
  127. deepeval/models/mlllms/gemini_model.py +0 -313
  128. deepeval/models/mlllms/ollama_model.py +0 -175
  129. deepeval/models/mlllms/openai_model.py +0 -309
  130. deepeval/optimization/__init__.py +0 -13
  131. deepeval/optimization/adapters/__init__.py +0 -2
  132. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  133. deepeval/optimization/aggregates.py +0 -14
  134. deepeval/optimization/copro/configs.py +0 -31
  135. deepeval/optimization/gepa/__init__.py +0 -7
  136. deepeval/optimization/gepa/configs.py +0 -115
  137. deepeval/optimization/miprov2/configs.py +0 -134
  138. deepeval/optimization/miprov2/loop.py +0 -785
  139. deepeval/optimization/mutations/__init__.py +0 -0
  140. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  141. deepeval/optimization/policies/__init__.py +0 -16
  142. deepeval/optimization/policies/tie_breaker.py +0 -67
  143. deepeval/optimization/prompt_optimizer.py +0 -462
  144. deepeval/optimization/simba/__init__.py +0 -0
  145. deepeval/optimization/simba/configs.py +0 -33
  146. deepeval/optimization/types.py +0 -361
  147. deepeval/test_case/mllm_test_case.py +0 -170
  148. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  149. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  150. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  152. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  153. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  154. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  155. {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
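
The headline change in this release is the removal of the dedicated multimodal test case layer: deepeval/test_case/mllm_test_case.py and the deepeval/models/mlllms/ wrappers are deleted, and the surviving multimodal metrics are typed against the standard LLMTestCase and DeepEvalBaseLLM stack, keeping MLLMImage for image content. The hunks below cover five of the refactored metric files. A minimal migration sketch first, assuming LLMTestCase in 3.7.5 accepts mixed text/image content (suggested by the +152 lines in llm_test_case.py and the convert_to_multi_modal_array calls in the hunks below) and that the 3.7.4 export paths are unchanged; the model name and URL are illustrative:

from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics import ImageCoherenceMetric

# 3.7.4 required MLLMTestCase here; 3.7.5 type-hints the standard LLMTestCase.
test_case = LLMTestCase(
    input="Explain the attached chart.",
    # Mixed text/image output; the metric normalizes this internally via
    # convert_to_multi_modal_array (see the image_coherence hunks below).
    actual_output=[
        "The chart shows quarterly revenue rising in Q3.",
        MLLMImage(url="https://example.com/chart.png"),
    ],
)

# model now accepts a str or DeepEvalBaseLLM, no longer DeepEvalBaseMLLM.
metric = ImageCoherenceMetric(model="gpt-4o")
print(metric.measure(test_case))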
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Optional, List, Tuple, Union
 
 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_coherence.template import (
     ImageCoherenceTemplate,
 )
@@ -10,32 +10,35 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-    initialize_multimodal_model,
+    initialize_model,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_coherence.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 
 
 class ImageCoherenceMetric(BaseMultimodalMetric):
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -45,13 +48,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -68,7 +71,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -145,13 +150,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -160,7 +165,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -253,7 +260,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         instructions = ImageCoherenceTemplate.evaluate_image_coherence(
            context_above, context_below
        )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
        if self.using_native_model:
            res, cost = self.model.generate(prompt, ReasonScore)
            self.evaluation_cost += cost
@@ -278,7 +285,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
        instructions = ImageCoherenceTemplate.evaluate_image_coherence(
            context_above, context_below
        )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
            self.evaluation_cost += cost
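
The ImageCoherenceMetric hunks above, and the image_editing, image_helpfulness, and image_reference hunks that follow, all route input and actual_output through the new convert_to_multi_modal_array helper (part of the +65 lines in deepeval/utils.py, which this excerpt does not show). A hypothetical sketch of the contract the call sites imply, nothing more:

from typing import List, Union
from deepeval.test_case import MLLMImage

def convert_to_multi_modal_array(
    value: Union[str, MLLMImage, List[Union[str, MLLMImage]]],
) -> List[Union[str, MLLMImage]]:
    # Assumed behavior: a bare string or single MLLMImage is wrapped in a
    # list, while an existing multimodal list passes through unchanged.
    if isinstance(value, list):
        return value
    return [value]

This would explain why 3.7.4 could use test_case.actual_output directly (MLLMTestCase always stored a list) while 3.7.5 must normalize first, since a plain LLMTestCase may carry a bare string.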
deepeval/metrics/multimodal_metrics/image_editing/image_editing.py

@@ -4,38 +4,41 @@ import math
 import textwrap
 
 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_editing.template import (
     ImageEditingTemplate,
 )
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-    initialize_multimodal_model,
+    initialize_model,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator
 
 
 class ImageEditingMetric(BaseMultimodalMetric):
 
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -44,13 +47,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self
+            test_case, self._required_params, 1, 1, self, self.model
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,12 +71,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                input_texts, input_images = self.separate_images_from_text(
-                    test_case.input
-                )
-                _, output_images = self.separate_images_from_text(
+                input = convert_to_multi_modal_array(test_case.input)
+                actual_output = convert_to_multi_modal_array(
                     test_case.actual_output
                 )
+                input_texts, input_images = self.separate_images_from_text(
+                    input
+                )
+                _, output_images = self.separate_images_from_text(actual_output)
 
                 self.SC_scores, self.SC_reasoning = (
                     self._evaluate_semantic_consistency(
@@ -107,13 +112,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self
+            test_case, self._required_params, 1, 1, self, self.model
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -123,12 +128,12 @@ class ImageEditingMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            input_texts, input_images = self.separate_images_from_text(
-                test_case.input
-            )
-            _, output_images = self.separate_images_from_text(
+            input = convert_to_multi_modal_array(test_case.input)
+            actual_output = convert_to_multi_modal_array(
                 test_case.actual_output
             )
+            input_texts, input_images = self.separate_images_from_text(input)
+            _, output_images = self.separate_images_from_text(actual_output)
             (self.SC_scores, self.SC_reasoning), (
                 self.PQ_scores,
                 self.PQ_reasoning,
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Optional, List, Tuple, Union
 
 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_helpfulness.template import (
     ImageHelpfulnessTemplate,
 )
@@ -10,33 +10,36 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-    initialize_multimodal_model,
+    initialize_model,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_helpfulness.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 
 
 class ImageHelpfulnessMetric(BaseMultimodalMetric):
 
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
        threshold: float = 0.5,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
        max_context_size: Optional[int] = None,
    ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
        self.evaluation_model = self.model.get_model_name()
        self.threshold = 1 if strict_mode else threshold
        self.strict_mode = strict_mode
@@ -46,13 +49,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +72,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +151,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +166,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,7 +261,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
            context_above, context_below
        )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
        if self.using_native_model:
            res, cost = self.model.generate(prompt, schema=ReasonScore)
            self.evaluation_cost += cost
@@ -279,7 +286,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
        instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
            context_above, context_below
        )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
            self.evaluation_cost += cost
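
Another pattern repeated across these files: every check_mllm_test_case_params call gains self.model as a sixth argument (compare the -/+ pairs in the measure and a_measure hunks above and below). The helper's definition lives in deepeval/metrics/utils.py (+39 -58), outside this excerpt; a hedged sketch of the signature the call sites imply:

from typing import List, Optional
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

# Hypothetical signature inferred from the call sites; the real definition
# in deepeval/metrics/utils.py is not part of this excerpt.
def check_mllm_test_case_params(
    test_case: LLMTestCase,
    test_case_params: List[LLMTestCaseParams],
    input_image_count: Optional[int],          # ImageEditingMetric passes 1
    actual_output_image_count: Optional[int],  # None appears to mean no limit
    metric: "BaseMultimodalMetric",
    model: "DeepEvalBaseLLM",  # new in 3.7.5, presumably so the check can
                               # verify the model handles image content
) -> None:
    ...  # assumed: raises if required params are missing or counts mismatch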
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Optional, List, Tuple, Union
 
 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_reference.template import (
     ImageReferenceTemplate,
 )
@@ -10,33 +10,36 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-    initialize_multimodal_model,
+    initialize_model,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_reference.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 
 
 class ImageReferenceMetric(BaseMultimodalMetric):
 
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
    ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
        threshold: float = 0.5,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
        max_context_size: Optional[int] = None,
    ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
        self.evaluation_model = self.model.get_model_name()
        self.threshold = 1 if strict_mode else threshold
        self.strict_mode = strict_mode
@@ -46,13 +49,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +72,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +151,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +166,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,7 +261,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
            context_above, context_below
        )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
        if self.using_native_model:
            res, cost = self.model.generate(prompt, schema=ReasonScore)
            self.evaluation_cost += cost
@@ -279,7 +286,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
        instructions = ImageReferenceTemplate.evaluate_image_reference(
            context_above, context_below
        )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
            self.evaluation_cost += cost
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py

@@ -1,11 +1,11 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
 
 from typing import Optional, List, Tuple, Type, Union
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics import BaseMultimodalMetric
 from deepeval.test_case import (
-    MLLMTestCaseParams,
-    MLLMTestCase,
+    LLMTestCaseParams,
+    LLMTestCase,
 )
 from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
     MultimodalGEvalTemplate,
@@ -17,7 +17,7 @@ from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import (
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
-    initialize_multimodal_model,
+    initialize_model,
     check_mllm_test_case_params,
     construct_verbose_logs,
     trimAndLoadJson,
@@ -42,11 +42,11 @@ class MultimodalGEval(BaseMultimodalMetric):
     def __init__(
         self,
         name: str,
-        evaluation_params: List[MLLMTestCaseParams],
+        evaluation_params: List[LLMTestCaseParams],
         criteria: Optional[str] = None,
         evaluation_steps: Optional[List[str]] = None,
         rubric: Optional[List[Rubric]] = None,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         top_logprobs: int = 20,
         async_mode: bool = True,
@@ -62,7 +62,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         self.evaluation_params = evaluation_params
         self.criteria = criteria
         self.rubric = validate_and_sort_rubrics(rubric)
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.evaluation_steps = (
             evaluation_steps
@@ -79,7 +79,7 @@ class MultimodalGEval(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
@@ -87,7 +87,7 @@ class MultimodalGEval(BaseMultimodalMetric):
     ) -> float:
 
         check_mllm_test_case_params(
-            test_case, self.evaluation_params, None, None, self
+            test_case, self.evaluation_params, None, None, self, self.model
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -134,7 +134,7 @@ class MultimodalGEval(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _additional_context: Optional[str] = None,
@@ -142,7 +142,7 @@ class MultimodalGEval(BaseMultimodalMetric):
     ) -> float:
 
         check_mllm_test_case_params(
-            test_case, self.evaluation_params, None, None, self
+            test_case, self.evaluation_params, None, None, self, self.model
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -185,15 +185,15 @@ class MultimodalGEval(BaseMultimodalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate([prompt], schema=Steps)
+            res, cost = await self.model.a_generate(prompt, schema=Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = await self.model.a_generate([prompt], schema=Steps)
+                res: Steps = await self.model.a_generate(prompt, schema=Steps)
                 return res.steps
             except TypeError:
-                res = await self.model.a_generate([prompt])
+                res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return data["steps"]
 
@@ -208,20 +208,20 @@ class MultimodalGEval(BaseMultimodalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = self.model.generate([prompt], schema=Steps)
+            res, cost = self.model.generate(prompt, schema=Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = self.model.generate([prompt], schema=Steps)
+                res: Steps = self.model.generate(prompt, schema=Steps)
                 return res.steps
             except TypeError:
-                res = self.model.generate([prompt])
+                res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return data["steps"]
 
     async def _a_evaluate(
-        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
+        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
     ) -> Tuple[Union[int, float], str]:
         test_case_list = construct_test_case_list(
             self.evaluation_params, test_case
@@ -296,7 +296,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         return data["score"], data["reason"]
 
     def _evaluate(
-        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
+        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
     ) -> Tuple[Union[int, float], str]:
         test_case_list = construct_test_case_list(
             self.evaluation_params, test_case
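
Taken together, MultimodalGEval now shares the G-Eval surface of the text metrics: evaluation_params come from LLMTestCaseParams, prompts go to generate/a_generate as plain strings instead of single-element lists, and the model slot takes a str or DeepEvalBaseLLM. A usage sketch under those assumptions; the metric name and criteria are illustrative, and the deepeval.metrics export path is assumed unchanged from 3.7.4:

from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
from deepeval.metrics import MultimodalGEval

metric = MultimodalGEval(
    name="Visual Accuracy",  # illustrative name and criteria
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    criteria="Does the response accurately describe the supplied images?",
    model="gpt-4o",
)

# Assumes LLMTestCase accepts mixed text/MLLMImage content in 3.7.5.
test_case = LLMTestCase(
    input=[
        "What is in this photo?",
        MLLMImage(url="https://example.com/photo.png"),
    ],
    actual_output="A cat sitting on a windowsill.",
)
print(metric.measure(test_case))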