deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
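The most consequential change visible in this list is the removal of the standalone multimodal stack: `deepeval/test_case/mllm_test_case.py`, `deepeval/models/mlllms/`, and most `deepeval/metrics/multimodal_metrics/` modules are gone, while `deepeval/test_case/llm_test_case.py` grows by roughly 200 lines and new turn-level RAG metrics appear (`turn_contextual_precision`, `turn_contextual_recall`, `turn_contextual_relevancy`, `turn_faithfulness`). A rough migration sketch, assuming `LLMTestCase` in 3.7.6 accepts mixed text/image content the way the diffs below suggest (the exact constructor shape is an assumption, not confirmed by this diff):

```python
from deepeval.test_case import LLMTestCase, MLLMImage

# 3.7.4: multimodal evaluation went through a separate test case type.
# from deepeval.test_case import MLLMTestCase   # removed in 3.7.6

# 3.7.6 (assumed shape): LLMTestCase carries text and MLLMImage objects
# together; the diffs below normalize these fields with
# convert_to_multi_modal_array() before separating text from images.
test_case = LLMTestCase(
    input=[
        "Turn the daytime sky into a sunset",       # instruction text
        MLLMImage(url="./before.png", local=True),  # source image
    ],
    actual_output=[MLLMImage(url="./after.png", local=True)],
)
```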
deepeval/metrics/multimodal_metrics/image_editing/image_editing.py (+79 -95; several removed lines below are truncated or blank because the source diff viewer did not render their full content)

```diff
@@ -3,39 +3,43 @@ from typing import Optional, List, Tuple, Union
 import math
 import textwrap
 
-from deepeval.metrics import
-from deepeval.test_case import
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_editing.template import (
     ImageEditingTemplate,
 )
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-
-
-
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator
 
 
-class ImageEditingMetric(BaseMultimodalMetric):
+class ImageEditingMetric(BaseMetric):
 
-    _required_params: List[
-
-
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.model, self.using_native_model =
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -44,13 +48,19 @@ class ImageEditingMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            1,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,12 +78,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 )
             )
         else:
-
-
-            )
-            _, output_images = self.separate_images_from_text(
+            input = convert_to_multi_modal_array(test_case.input)
+            actual_output = convert_to_multi_modal_array(
                 test_case.actual_output
             )
+            input_texts, input_images = self.separate_images_from_text(
+                input
+            )
+            _, output_images = self.separate_images_from_text(actual_output)
 
             self.SC_scores, self.SC_reasoning = (
                 self._evaluate_semantic_consistency(
@@ -98,7 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -107,13 +119,19 @@ class ImageEditingMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            1,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -123,12 +141,12 @@ class ImageEditingMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-
-
-            )
-            _, output_images = self.separate_images_from_text(
+            input = convert_to_multi_modal_array(test_case.input)
+            actual_output = convert_to_multi_modal_array(
                 test_case.actual_output
             )
+            input_texts, input_images = self.separate_images_from_text(input)
+            _, output_images = self.separate_images_from_text(actual_output)
             (self.SC_scores, self.SC_reasoning), (
                 self.PQ_scores,
                 self.PQ_reasoning,
@@ -153,7 +171,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -185,24 +203,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 text_prompt=text_prompt
             )
         ]
-
-
-
-
-
-
-
-        try:
-            res: ReasonScore = await self.model.a_generate(
-                prompt + images, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(
-                prompt + images, input_text=prompt
-            )
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_semantic_consistency(
         self,
@@ -217,20 +224,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 text_prompt=text_prompt
            )
         ]
-
-
-
-
-
-
-
-                prompt + images, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt + images)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def _a_evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -239,22 +239,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
         prompt = [
             ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
-
-
-
-
-
-
-
-        try:
-            res: ReasonScore = await self.model.a_generate(
-                prompt + images, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(prompt + images)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -263,22 +254,15 @@ class ImageEditingMetric(BaseMultimodalMetric):
         prompt = [
             ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
-
-
-
-
-
-
-
-                prompt + images, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt + images)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
-    def _calculate_score(self) ->
+    def _calculate_score(self) -> float:
         min_SC_score = min(self.SC_scores)
         min_PQ_score = min(self.PQ_scores)
         return math.sqrt(min_SC_score * min_PQ_score) / 10
@@ -288,14 +272,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
     def _generate_reason(
         self,
-    ) ->
+    ) -> str:
         return textwrap.dedent(
             f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}
```
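Every rewritten method above collapses the same inlined pattern (attempt structured generation with `schema=ReasonScore`, fall back to free-form generation plus `trimAndLoadJson` on `TypeError`) into the shared `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` helpers now imported from `deepeval.metrics.utils`. The helper's real body is not shown in this diff; below is a minimal sketch of what the call sites imply it does. The native-model branch and the cost accounting are assumptions inferred from the removed code and from `self.evaluation_cost = 0 if self.using_native_model else None` above:

```python
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    """Sketch only: mirrors the try/except the 3.7.4 call sites inlined."""
    if metric.using_native_model:
        # Assumed: native models return (result, cost) and support
        # structured output directly, so cost is accumulated here.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a `schema=` keyword.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without `schema=`: parse the raw JSON response,
        # exactly as the removed except-branches did.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```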
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py (+59 -53; same caveat as above on truncated or blank removed lines)

```diff
@@ -1,42 +1,46 @@
 import asyncio
 from typing import Optional, List, Tuple, Union
 
-from deepeval.metrics import
-from deepeval.test_case import
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_helpfulness.template import (
     ImageHelpfulnessTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-
-
-
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_helpfulness.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 
 
-class ImageHelpfulnessMetric(BaseMultimodalMetric):
+class ImageHelpfulnessMetric(BaseMetric):
 
-    _required_params: List[
-
-
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model =
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -46,13 +50,19 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +79,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
                 )
             )
         else:
-            actual_output =
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -146,13 +158,19 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +179,9 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output =
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,21 +274,14 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
             context_above, context_below
         )
-        prompt =
-
-
-
-
-
-
-
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def a_evaluate_image_helpfulness(
         self,
@@ -279,21 +292,14 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         instructions = ImageHelpfulnessTemplate.evaluate_image_helpfulness(
             context_above, context_below
         )
-        prompt =
-
-
-
-
-
-
-
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -328,7 +334,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]
 
-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)
 
     def is_successful(self) -> bool:
@@ -337,7 +343,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
```
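After both rewrites, the two metrics behave like the rest of the single-turn metrics: construct with an optional model, then call `measure()` with an `LLMTestCase` instead of the removed `MLLMTestCase`. A usage sketch under the assumptions above (the model name and image paths are placeholders, and the multimodal `LLMTestCase` constructor shape is inferred from these diffs rather than confirmed):

```python
from deepeval.metrics.multimodal_metrics.image_editing.image_editing import (
    ImageEditingMetric,
)
from deepeval.test_case import LLMTestCase, MLLMImage

# Placeholder model string; per the diff, initialize_model() resolves a
# string, a DeepEvalBaseLLM instance, or None (default model).
metric = ImageEditingMetric(model="gpt-4.1", threshold=0.5, verbose_mode=True)

test_case = LLMTestCase(
    input=[
        "Replace the background with a beach at golden hour",
        MLLMImage(url="./original.png", local=True),
    ],
    actual_output=[MLLMImage(url="./edited.png", local=True)],
)

# measure() returns a float and populates metric.score / metric.reason;
# per _calculate_score() above, the score is sqrt(min(SC) * min(PQ)) / 10.
score = metric.measure(test_case)
print(score, metric.reason)
```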