deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
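Taken together, the listing shows three themes: the dedicated multimodal test case and the per-modality `multimodal_*` metric classes are removed, with the surviving image metrics rewritten against the standard `LLMTestCase` (and `MLLMImage` content parts); new turn-level RAG metrics are added under `deepeval/metrics/turn_*`; and the `deepeval.optimization` package is reorganized as `deepeval.optimizer`. A hedged sketch of the import-level impact, using only module paths taken from the listing (whether `deepeval/metrics/__init__.py` re-exports these classes is not visible here, so full module paths are used):

```python
# Hedged sketch of the 3.7.4 -> 3.7.5 import changes implied by the file
# listing above; full module paths are used because package re-exports are
# not shown in this diff.
from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
from deepeval.metrics.multimodal_metrics.image_coherence.image_coherence import (
    ImageCoherenceMetric,
)

# Modules deleted in 3.7.5 (importing them is now expected to fail):
#   deepeval.test_case.mllm_test_case
#   deepeval.metrics.multimodal_metrics.multimodal_answer_relevancy
#   deepeval.models.mlllms
#   deepeval.optimization        (reorganized under deepeval.optimizer)
```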
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Optional, List, Tuple, Union

 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_coherence.template import (
     ImageCoherenceTemplate,
 )
@@ -10,32 +10,35 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-
+    initialize_model,
 )
-from deepeval.models import
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_coherence.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)


 class ImageCoherenceMetric(BaseMultimodalMetric):
-    _required_params: List[
-
-
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]

     def __init__(
         self,
-        model: Optional[Union[str,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model =
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -45,13 +48,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):

     def measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -68,7 +71,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output =
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -145,13 +150,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -160,7 +165,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output =
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -253,7 +260,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         instructions = ImageCoherenceTemplate.evaluate_image_coherence(
             context_above, context_below
         )
-        prompt =
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = self.model.generate(prompt, ReasonScore)
             self.evaluation_cost += cost
@@ -278,7 +285,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         instructions = ImageCoherenceTemplate.evaluate_image_coherence(
             context_above, context_below
         )
-        prompt =
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost
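For orientation, a minimal usage sketch of the rewritten metric, based only on the signatures above: `measure` now takes a plain `LLMTestCase`, and `actual_output` is normalized through `convert_to_multi_modal_array`, which suggests a mixed list of strings and `MLLMImage` parts is accepted. The `MLLMImage(url=...)` constructor, the list-valued `actual_output`, and the `"gpt-4o"` model string are assumptions, not confirmed by this diff.

```python
from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.image_coherence.image_coherence import (
    ImageCoherenceMetric,
)

# Assumed: LLMTestCase carries interleaved text and MLLMImage parts, which
# measure() converts internally via convert_to_multi_modal_array.
test_case = LLMTestCase(
    input="Write an illustrated guide to brewing pour-over coffee.",
    actual_output=[
        "Step 1: grind 20 g of beans to a medium-fine setting.",
        MLLMImage(url="https://example.com/grind.png"),  # assumed constructor
        "Step 2: bloom with 40 g of water for 30 seconds.",
        MLLMImage(url="https://example.com/bloom.png"),
    ],
)

metric = ImageCoherenceMetric(model="gpt-4o", threshold=0.5)  # model string assumed
print(metric.measure(test_case))
```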
deepeval/metrics/multimodal_metrics/image_editing/image_editing.py

@@ -4,38 +4,41 @@ import math
 import textwrap

 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_editing.template import (
     ImageEditingTemplate,
 )
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-
+    initialize_model,
 )
-from deepeval.models import
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator


 class ImageEditingMetric(BaseMultimodalMetric):

-    _required_params: List[
-
-
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]

     def __init__(
         self,
-        model: Optional[Union[str,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.model, self.using_native_model =
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -44,13 +47,13 @@ class ImageEditingMetric(BaseMultimodalMetric):

     def measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self
+            test_case, self._required_params, 1, 1, self, self.model
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,12 +71,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-
-
-                )
-                _, output_images = self.separate_images_from_text(
+                input = convert_to_multi_modal_array(test_case.input)
+                actual_output = convert_to_multi_modal_array(
                     test_case.actual_output
                 )
+                input_texts, input_images = self.separate_images_from_text(
+                    input
+                )
+                _, output_images = self.separate_images_from_text(actual_output)

                 self.SC_scores, self.SC_reasoning = (
                     self._evaluate_semantic_consistency(
@@ -107,13 +112,13 @@ class ImageEditingMetric(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, 1, 1, self
+            test_case, self._required_params, 1, 1, self, self.model
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -123,12 +128,12 @@ class ImageEditingMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-
-
-            )
-            _, output_images = self.separate_images_from_text(
+            input = convert_to_multi_modal_array(test_case.input)
+            actual_output = convert_to_multi_modal_array(
                 test_case.actual_output
             )
+            input_texts, input_images = self.separate_images_from_text(input)
+            _, output_images = self.separate_images_from_text(actual_output)
             (self.SC_scores, self.SC_reasoning), (
                 self.PQ_scores,
                 self.PQ_reasoning,
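The `1, 1` bounds passed to `check_mllm_test_case_params` read as "exactly one image in the input and one in the actual output", which fits an editing task: the original image plus the edit instruction go in `input`, the edited image in `actual_output`. A sketch under that reading, with the same assumed `MLLMImage` constructor as above:

```python
from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.image_editing.image_editing import (
    ImageEditingMetric,
)

# One source image alongside the instruction, one edited image in the output;
# the 1, 1 bounds above appear to enforce a single image on each side.
test_case = LLMTestCase(
    input=[
        "Remove the lamp post from this street photo.",
        MLLMImage(url="https://example.com/original.png"),  # assumed constructor
    ],
    actual_output=[MLLMImage(url="https://example.com/edited.png")],
)

metric = ImageEditingMetric(threshold=0.5)
print(metric.measure(test_case))
```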
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Optional, List, Tuple, Union

 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_helpfulness.template import (
     ImageHelpfulnessTemplate,
 )
@@ -10,33 +10,36 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-
+    initialize_model,
 )
-from deepeval.models import
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_helpfulness.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)


 class ImageHelpfulnessMetric(BaseMultimodalMetric):

-    _required_params: List[
-
-
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]

     def __init__(
         self,
-        model: Optional[Union[str,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model =
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -46,13 +49,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):

     def measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +72,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output =
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +151,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +166,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output =
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,7 +261,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
             context_above, context_below
         )
-        prompt =
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost
@@ -279,7 +286,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
             context_above, context_below
         )
-        prompt =
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py

@@ -2,7 +2,7 @@ import asyncio
 from typing import Optional, List, Tuple, Union

 from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_reference.template import (
     ImageReferenceTemplate,
 )
@@ -10,33 +10,36 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_mllm_test_case_params,
-
+    initialize_model,
 )
-from deepeval.models import
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_reference.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)


 class ImageReferenceMetric(BaseMultimodalMetric):

-    _required_params: List[
-
-
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]

     def __init__(
         self,
-        model: Optional[Union[str,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model =
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -46,13 +49,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):

     def measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +72,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output =
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +151,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
         check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+            test_case, self._required_params, None, None, self, self.model
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +166,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output =
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,7 +261,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
             context_above, context_below
         )
-        prompt =
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost
@@ -279,7 +286,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
             context_above, context_below
         )
-        prompt =
+        prompt = f"{instructions} \nImages: {image}"
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
             self.evaluation_cost += cost
deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py

@@ -1,11 +1,11 @@
 """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

 from typing import Optional, List, Tuple, Type, Union
-from deepeval.models import
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics import BaseMultimodalMetric
 from deepeval.test_case import (
-
-
+    LLMTestCaseParams,
+    LLMTestCase,
 )
 from deepeval.metrics.multimodal_metrics.multimodal_g_eval.template import (
     MultimodalGEvalTemplate,
@@ -17,7 +17,7 @@ from deepeval.metrics.multimodal_metrics.multimodal_g_eval.schema import (
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.utils import (
-
+    initialize_model,
     check_mllm_test_case_params,
     construct_verbose_logs,
     trimAndLoadJson,
@@ -42,11 +42,11 @@ class MultimodalGEval(BaseMultimodalMetric):
     def __init__(
         self,
         name: str,
-        evaluation_params: List[
+        evaluation_params: List[LLMTestCaseParams],
         criteria: Optional[str] = None,
         evaluation_steps: Optional[List[str]] = None,
         rubric: Optional[List[Rubric]] = None,
-        model: Optional[Union[str,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         top_logprobs: int = 20,
         async_mode: bool = True,
@@ -62,7 +62,7 @@ class MultimodalGEval(BaseMultimodalMetric):
         self.evaluation_params = evaluation_params
         self.criteria = criteria
         self.rubric = validate_and_sort_rubrics(rubric)
-        self.model, self.using_native_model =
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.evaluation_steps = (
             evaluation_steps
@@ -79,7 +79,7 @@ class MultimodalGEval(BaseMultimodalMetric):

     def measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
@@ -87,7 +87,7 @@ class MultimodalGEval(BaseMultimodalMetric):
     ) -> float:

         check_mllm_test_case_params(
-            test_case, self.evaluation_params, None, None, self
+            test_case, self.evaluation_params, None, None, self, self.model
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -134,7 +134,7 @@ class MultimodalGEval(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _additional_context: Optional[str] = None,
@@ -142,7 +142,7 @@ class MultimodalGEval(BaseMultimodalMetric):
     ) -> float:

         check_mllm_test_case_params(
-            test_case, self.evaluation_params, None, None, self
+            test_case, self.evaluation_params, None, None, self, self.model
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -185,15 +185,15 @@ class MultimodalGEval(BaseMultimodalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(
+            res, cost = await self.model.a_generate(prompt, schema=Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = await self.model.a_generate(
+                res: Steps = await self.model.a_generate(prompt, schema=Steps)
                 return res.steps
             except TypeError:
-                res = await self.model.a_generate(
+                res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return data["steps"]

@@ -208,20 +208,20 @@ class MultimodalGEval(BaseMultimodalMetric):
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = self.model.generate(
+            res, cost = self.model.generate(prompt, schema=Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = self.model.generate(
+                res: Steps = self.model.generate(prompt, schema=Steps)
                 return res.steps
             except TypeError:
-                res = self.model.generate(
+                res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
                 return data["steps"]

     async def _a_evaluate(
-        self, test_case:
+        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
     ) -> Tuple[Union[int, float], str]:
         test_case_list = construct_test_case_list(
             self.evaluation_params, test_case
@@ -296,7 +296,7 @@ class MultimodalGEval(BaseMultimodalMetric):
                 return data["score"], data["reason"]

     def _evaluate(
-        self, test_case:
+        self, test_case: LLMTestCase, _additional_context: Optional[str] = None
     ) -> Tuple[Union[int, float], str]:
         test_case_list = construct_test_case_list(
             self.evaluation_params, test_case
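Finally, a hedged sketch of the updated `MultimodalGEval` constructor, following the signature shown above (`name`, `evaluation_params: List[LLMTestCaseParams]`, optional `criteria`, and `model` as a string or `DeepEvalBaseLLM`); the criteria text, model string, and test case contents are illustrative only.

```python
from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
from deepeval.metrics.multimodal_metrics.multimodal_g_eval.multimodal_g_eval import (
    MultimodalGEval,
)

metric = MultimodalGEval(
    name="Illustration Quality",  # illustrative name
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
    criteria="Do the images support and accurately depict the accompanying text?",
    model="gpt-4o",  # any str or DeepEvalBaseLLM, per the signature above
    threshold=0.5,
)

test_case = LLMTestCase(
    input="Explain how a pin tumbler lock works, with a diagram.",
    actual_output=[
        "The key's ridges lift each pin stack to the shear line.",
        MLLMImage(url="https://example.com/lock-diagram.png"),  # assumed constructor
    ],
)
print(metric.measure(test_case))
```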