deepeval-3.7.3-py3-none-any.whl → deepeval-3.7.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/test.py +1 -1
  3. deepeval/config/settings.py +102 -13
  4. deepeval/dataset/golden.py +54 -2
  5. deepeval/evaluate/configs.py +1 -1
  6. deepeval/evaluate/evaluate.py +16 -8
  7. deepeval/evaluate/execute.py +74 -27
  8. deepeval/evaluate/utils.py +26 -22
  9. deepeval/integrations/pydantic_ai/agent.py +19 -2
  10. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  11. deepeval/metrics/__init__.py +14 -12
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
  13. deepeval/metrics/answer_relevancy/template.py +188 -92
  14. deepeval/metrics/argument_correctness/template.py +2 -2
  15. deepeval/metrics/base_metric.py +2 -5
  16. deepeval/metrics/bias/template.py +3 -3
  17. deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
  18. deepeval/metrics/contextual_precision/template.py +115 -66
  19. deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
  20. deepeval/metrics/contextual_recall/template.py +106 -55
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
  22. deepeval/metrics/contextual_relevancy/template.py +87 -58
  23. deepeval/metrics/conversation_completeness/template.py +2 -2
  24. deepeval/metrics/conversational_dag/templates.py +4 -4
  25. deepeval/metrics/conversational_g_eval/template.py +4 -3
  26. deepeval/metrics/dag/templates.py +5 -5
  27. deepeval/metrics/faithfulness/faithfulness.py +70 -27
  28. deepeval/metrics/faithfulness/schema.py +1 -1
  29. deepeval/metrics/faithfulness/template.py +200 -115
  30. deepeval/metrics/g_eval/utils.py +2 -2
  31. deepeval/metrics/hallucination/template.py +4 -4
  32. deepeval/metrics/indicator.py +4 -4
  33. deepeval/metrics/misuse/template.py +2 -2
  34. deepeval/metrics/multimodal_metrics/__init__.py +0 -18
  35. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
  36. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
  37. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
  38. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
  39. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
  40. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
  41. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
  42. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
  43. deepeval/metrics/non_advice/template.py +2 -2
  44. deepeval/metrics/pii_leakage/template.py +2 -2
  45. deepeval/metrics/prompt_alignment/template.py +4 -4
  46. deepeval/metrics/ragas.py +3 -3
  47. deepeval/metrics/role_violation/template.py +2 -2
  48. deepeval/metrics/step_efficiency/step_efficiency.py +1 -1
  49. deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
  50. deepeval/metrics/toxicity/template.py +4 -4
  51. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  52. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  53. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
  54. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  55. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  56. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
  57. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  58. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  59. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
  60. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  61. deepeval/metrics/turn_faithfulness/template.py +218 -0
  62. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
  63. deepeval/metrics/turn_relevancy/template.py +2 -2
  64. deepeval/metrics/utils.py +39 -58
  65. deepeval/models/__init__.py +0 -12
  66. deepeval/models/base_model.py +16 -38
  67. deepeval/models/embedding_models/__init__.py +7 -0
  68. deepeval/models/embedding_models/azure_embedding_model.py +69 -32
  69. deepeval/models/embedding_models/local_embedding_model.py +39 -22
  70. deepeval/models/embedding_models/ollama_embedding_model.py +42 -18
  71. deepeval/models/embedding_models/openai_embedding_model.py +50 -15
  72. deepeval/models/llms/amazon_bedrock_model.py +1 -2
  73. deepeval/models/llms/anthropic_model.py +53 -20
  74. deepeval/models/llms/azure_model.py +140 -43
  75. deepeval/models/llms/deepseek_model.py +38 -23
  76. deepeval/models/llms/gemini_model.py +222 -103
  77. deepeval/models/llms/grok_model.py +39 -27
  78. deepeval/models/llms/kimi_model.py +39 -23
  79. deepeval/models/llms/litellm_model.py +103 -45
  80. deepeval/models/llms/local_model.py +35 -22
  81. deepeval/models/llms/ollama_model.py +129 -17
  82. deepeval/models/llms/openai_model.py +151 -50
  83. deepeval/models/llms/portkey_model.py +149 -0
  84. deepeval/models/llms/utils.py +5 -3
  85. deepeval/models/retry_policy.py +17 -14
  86. deepeval/models/utils.py +94 -4
  87. deepeval/optimizer/__init__.py +5 -0
  88. deepeval/optimizer/algorithms/__init__.py +6 -0
  89. deepeval/optimizer/algorithms/base.py +29 -0
  90. deepeval/optimizer/algorithms/configs.py +18 -0
  91. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  92. deepeval/optimizer/algorithms/copro/copro.py +836 -0
  93. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  94. deepeval/optimizer/algorithms/gepa/gepa.py +737 -0
  95. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  96. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  97. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  98. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  99. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  100. deepeval/optimizer/algorithms/simba/simba.py +999 -0
  101. deepeval/optimizer/algorithms/simba/types.py +15 -0
  102. deepeval/optimizer/configs.py +31 -0
  103. deepeval/optimizer/policies.py +227 -0
  104. deepeval/optimizer/prompt_optimizer.py +263 -0
  105. deepeval/optimizer/rewriter/__init__.py +5 -0
  106. deepeval/optimizer/rewriter/rewriter.py +124 -0
  107. deepeval/optimizer/rewriter/utils.py +214 -0
  108. deepeval/optimizer/scorer/__init__.py +5 -0
  109. deepeval/optimizer/scorer/base.py +86 -0
  110. deepeval/optimizer/scorer/scorer.py +316 -0
  111. deepeval/optimizer/scorer/utils.py +30 -0
  112. deepeval/optimizer/types.py +148 -0
  113. deepeval/optimizer/utils.py +480 -0
  114. deepeval/prompt/prompt.py +7 -6
  115. deepeval/test_case/__init__.py +1 -3
  116. deepeval/test_case/api.py +12 -10
  117. deepeval/test_case/conversational_test_case.py +19 -1
  118. deepeval/test_case/llm_test_case.py +152 -1
  119. deepeval/test_case/utils.py +4 -8
  120. deepeval/test_run/api.py +15 -14
  121. deepeval/test_run/cache.py +2 -0
  122. deepeval/test_run/test_run.py +9 -4
  123. deepeval/tracing/patchers.py +9 -4
  124. deepeval/tracing/tracing.py +2 -2
  125. deepeval/utils.py +89 -0
  126. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
  127. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/RECORD +134 -118
  128. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  129. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  130. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  131. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  132. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  133. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  134. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  135. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  136. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  137. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  138. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  139. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  140. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  141. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  142. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  143. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  144. deepeval/models/mlllms/__init__.py +0 -4
  145. deepeval/models/mlllms/azure_model.py +0 -334
  146. deepeval/models/mlllms/gemini_model.py +0 -284
  147. deepeval/models/mlllms/ollama_model.py +0 -144
  148. deepeval/models/mlllms/openai_model.py +0 -258
  149. deepeval/test_case/mllm_test_case.py +0 -170
  150. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  151. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  152. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  153. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  154. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
  155. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
  156. {deepeval-3.7.3.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
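
The hunks below show the shape of this release's main refactor: the multimodal metrics drop MLLMTestCase and DeepEvalBaseMLLM and run on the ordinary LLMTestCase and LLM judge path, with convert_to_multi_modal_array normalising mixed text/image inputs. The following is a minimal sketch of how a caller might exercise the new signature; the MLLMImage keyword arguments, the mixed text/image lists passed to LLMTestCase, the "gpt-4o" judge string, and the image paths are illustrative assumptions and are not taken from this diff.

from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.image_editing.image_editing import (
    ImageEditingMetric,
)

# Input mixes an instruction with the source image; actual_output carries the edited image.
# (Assumed: LLMTestCase accepts lists of str/MLLMImage in 3.7.5, as implied by
# convert_to_multi_modal_array(test_case.input) in the hunks below.)
test_case = LLMTestCase(
    input=[
        "Replace the daytime sky with a sunset.",
        MLLMImage(url="./original.png", local=True),  # assumed kwargs
    ],
    actual_output=[MLLMImage(url="./edited.png", local=True)],
)

metric = ImageEditingMetric(model="gpt-4o", threshold=0.5)  # any model name or DeepEvalBaseLLM
score = metric.measure(test_case)  # returns a float score

The same LLMTestCase-based pattern applies to ImageHelpfulnessMetric, ImageReferenceMetric, and MultimodalGEval below; the deleted multimodal_* metric folders and deepeval/models/mlllms/ in the file list are the other half of this consolidation.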

deepeval/metrics/multimodal_metrics/image_editing/image_editing.py

@@ -4,38 +4,41 @@ import math
 import textwrap

 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_editing.template import (
     ImageEditingTemplate,
 )
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-    initialize_multimodal_model,
+    initialize_model,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator


 class ImageEditingMetric(BaseMultimodalMetric):

-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]

     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -44,13 +47,13 @@ class ImageEditingMetric(BaseMultimodalMetric):

     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self
+            test_case, self._required_params, 1, 1, self, self.model
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,12 +71,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                input_texts, input_images = self.separate_images_from_text(
-                    test_case.input
-                )
-                _, output_images = self.separate_images_from_text(
+                input = convert_to_multi_modal_array(test_case.input)
+                actual_output = convert_to_multi_modal_array(
                     test_case.actual_output
                 )
+                input_texts, input_images = self.separate_images_from_text(
+                    input
+                )
+                _, output_images = self.separate_images_from_text(actual_output)

                 self.SC_scores, self.SC_reasoning = (
                     self._evaluate_semantic_consistency(
@@ -107,13 +112,13 @@ class ImageEditingMetric(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self
+            test_case, self._required_params, 1, 1, self, self.model
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -123,12 +128,12 @@ class ImageEditingMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            input_texts, input_images = self.separate_images_from_text(
-                test_case.input
-            )
-            _, output_images = self.separate_images_from_text(
+            input = convert_to_multi_modal_array(test_case.input)
+            actual_output = convert_to_multi_modal_array(
                 test_case.actual_output
             )
+            input_texts, input_images = self.separate_images_from_text(input)
+            _, output_images = self.separate_images_from_text(actual_output)
             (self.SC_scores, self.SC_reasoning), (
                 self.PQ_scores,
                 self.PQ_reasoning,

deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Optional, List, Tuple, Union

 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_helpfulness.template import (
     ImageHelpfulnessTemplate,
 )
@@ -10,33 +10,36 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-    initialize_multimodal_model,
+    initialize_model,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_helpfulness.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)


 class ImageHelpfulnessMetric(BaseMultimodalMetric):

-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]

     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -46,13 +49,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):

     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +72,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +151,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +166,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,7 +261,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost
@@ -279,7 +286,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost

deepeval/metrics/multimodal_metrics/image_reference/image_reference.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Optional, List, Tuple, Union

 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_reference.template import (
     ImageReferenceTemplate,
 )
@@ -10,33 +10,36 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-    initialize_multimodal_model,
+    initialize_model,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_reference.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)


 class ImageReferenceMetric(BaseMultimodalMetric):

-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]

     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -46,13 +49,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):

     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +72,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +151,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +166,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,7 +261,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost
@@ -279,7 +286,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
             context_above, context_below
         )
-        prompt = [instructions] + [image]
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost

deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py

@@ -1,11 +1,11 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

 from typing import Optional, List, Tuple, Type, Union
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics import BaseMultimodalMetric
 from deepeval.test_case import (
-    MLLMTestCaseParams,
-    MLLMTestCase,
+    LLMTestCaseParams,
+    LLMTestCase,
 )
 from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
     MultimodalGEvalTemplate,
@@ -17,7 +17,7 @@ from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import (
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
-    initialize_multimodal_model,
+    initialize_model,
     check_mllm_test_case_params,
     construct_verbose_logs,
     trimAndLoadJson,
@@ -42,11 +42,11 @@ class MultimodalGEval(BaseMultimodalMetric):
     def __init__(
         self,
         name: str,
-        evaluation_params: List[MLLMTestCaseParams],
+        evaluation_params: List[LLMTestCaseParams],
         criteria: Optional[str] = None,
         evaluation_steps: Optional[List[str]] = None,
         rubric: Optional[List[Rubric]] = None,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         top_logprobs: int = 20,
         async_mode: bool = True,
@@ -62,7 +62,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         self.evaluation_params = evaluation_params
         self.criteria = criteria
         self.rubric = validate_and_sort_rubrics(rubric)
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.evaluation_steps = (
             evaluation_steps
@@ -79,7 +79,7 @@ class MultimodalGEval(BaseMultimodalMetric):

     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
@@ -87,7 +87,7 @@ class MultimodalGEval(BaseMultimodalMetric):
     ) -> float:

         check_mllm_test_case_params(
-            test_case, self.evaluation_params, None, None, self
+            test_case, self.evaluation_params, None, None, self, self.model
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -134,7 +134,7 @@ class MultimodalGEval(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _additional_context: Optional[str] = None,
@@ -142,7 +142,7 @@ class MultimodalGEval(BaseMultimodalMetric):
     ) -> float:

         check_mllm_test_case_params(
-            test_case, self.evaluation_params, None, None, self
+            test_case, self.evaluation_params, None, None, self, self.model
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -185,15 +185,15 @@ class MultimodalGEval(BaseMultimodalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate([prompt], schema=Steps)
+            res, cost = await self.model.a_generate(prompt, schema=Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = await self.model.a_generate([prompt], schema=Steps)
+                res: Steps = await self.model.a_generate(prompt, schema=Steps)
                 return res.steps
             except TypeError:
-                res = await self.model.a_generate([prompt])
+                res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return data["steps"]

@@ -208,20 +208,20 @@ class MultimodalGEval(BaseMultimodalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = self.model.generate([prompt], schema=Steps)
+            res, cost = self.model.generate(prompt, schema=Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = self.model.generate([prompt], schema=Steps)
+                res: Steps = self.model.generate(prompt, schema=Steps)
                 return res.steps
             except TypeError:
-                res = self.model.generate([prompt])
+                res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return data["steps"]

     async def _a_evaluate(
-        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
+        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
     ) -> Tuple[Union[int, float], str]:
         test_case_list = construct_test_case_list(
             self.evaluation_params, test_case
@@ -296,7 +296,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         return data["score"], data["reason"]

     def _evaluate(
-        self, test_case: MLLMTestCase, _additional_context: Optional[str] = None
+        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
     ) -> Tuple[Union[int, float], str]:
         test_case_list = construct_test_case_list(
             self.evaluation_params, test_case

deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py

@@ -53,53 +53,45 @@ class MultimodalGEvalTemplate:
             else ""
         )

-        return (
-            [
-                textwrap.dedent(
-                    f"""You are an evaluator. Given the following {dependencies}, assess the response below and return a JSON object with two fields:
-
-                    - `"score"`: an integer between {score_range[0]} and {score_range[1]}, {score_explanation}.
-                    - `"reason"`: a brief explanation for why the score was given. This must mention specific strengths or shortcomings, referencing relevant details from the input. Do **not** quote the score itself in the explanation.
-
-                    Your explanation should:
-                    - {reasoning_expectation}
-                    - Mention key details from the test case parameters.
-                    - Be concise, clear, and focused on the evaluation logic.
-
-                    Only return valid JSON. Do **not** include any extra commentary or text.
-
-                    ---
-
-                    Evaluation Steps:
-                    {evaluation_steps}
-
-                    {rubric_text}
-                    Test Case:
-                    ************************
-                    """
-                )
-            ]
-            + test_case_list
-            + [
-                textwrap.dedent(
-                    f"""
-                    ************************
-                    \n\n\n
-                    Parameters:
-                    {parameters}
-                    {additional_context}
-
-                    ---
-                    **Example JSON:**
-                    {{
-                        "score": {score_range[0]},
-                        "reason": "your concise and informative reason here"
-                    }}
-
-                    JSON:
-                    """
-                )
-            ]
+        return textwrap.dedent(
+            f"""You are an evaluator. Given the following {dependencies}, assess the response below and return a JSON object with two fields:
+
+            - `"score"`: an integer between {score_range[0]} and {score_range[1]}, {score_explanation}.
+            - `"reason"`: a brief explanation for why the score was given. This must mention specific strengths or shortcomings, referencing relevant details from the input. Do **not** quote the score itself in the explanation.
+
+            Your explanation should:
+            - {reasoning_expectation}
+            - Mention key details from the test case parameters.
+            - Be concise, clear, and focused on the evaluation logic.
+
+            Only return valid JSON. Do **not** include any extra commentary or text.
+
+            ---
+
+            Evaluation Steps:
+            {evaluation_steps}
+
+            {rubric_text}
+            Test Case:
+            ************************
+
+            {test_case_list}
+
+            ************************
+            \n\n\n
+            Parameters:
+            {parameters}
+            {additional_context}
+
+            ---
+            **Example JSON:**
+            {{
+                "reason": "your concise and informative reason here",
+                "score": {score_range[0]}
+            }}
+
+            JSON:
+            """
         )

     @staticmethod
@@ -114,35 +106,28 @@ class MultimodalGEvalTemplate:
             if _additional_context
             else ""
         )
-        return (
-            [
-                textwrap.dedent(
-                    f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
-
-                    Evaluation Steps:
-                    {evaluation_steps}
-                    ************************
-                    """
-                )
-            ]
-            + test_case_list
-            + [
-                textwrap.dedent(
-                    f"""
-                    ************************
-                    {additional_context}
-                    **
-                    IMPORTANT: Please make sure to only return in JSON format, with the "score" and "reason" key. No words or explanation is needed.
-
-                    Example JSON:
-                    {{
-                        "score": 0,
-                        "reason": "The text does not follow the evaluation steps provided."
-                    }}
-                    **
-
-                    JSON:
-                    """
-                )
-            ]
+        return textwrap.dedent(
+            f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key that is STRICTLY EITHER 1 (follows the criteria 100% outlined in the evaluation steps), OR 0 (does not follow the criteria), and 2) a `reason` key, a reason for the given score, but DO NOT QUOTE THE SCORE in your reason. Please mention specific information from {parameters} in your reason, but be very concise with it!
+
+            Evaluation Steps:
+            {evaluation_steps}
+            ************************
+
+            {test_case_list}
+
+            ************************
+            {additional_context}
+
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the "score" and "reason" key. No words or explanation is needed.
+
+            Example JSON:
+            {{
+                "reason": "The text does not follow the evaluation steps provided.",
+                "score": 0
+            }}
+            **
+
+            JSON:
+            """
         )