deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
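
The headline change in 3.7.6 is the retirement of the separate multimodal stack: `mllm_test_case.py` and the `models/mlllms` wrappers are deleted, the `multimodal_*` metric variants are removed, and the surviving image metrics (`image_coherence`, `image_editing`, `image_helpfulness`, `image_reference`, `text_to_image`) are rebased from `BaseMultimodalMetric` onto the standard `BaseMetric`/`LLMTestCase` path, with images carried as `MLLMImage` objects and detected via `test_case.multimodal`. The release also adds turn-level RAG metrics (`turn_contextual_precision`, `turn_contextual_recall`, `turn_contextual_relevancy`, `turn_faithfulness`) and renames the `optimization` package to `optimizer`, with COPRO, GEPA, SIMBA, and a rewritten MIPROv2 under `optimizer/algorithms`. The two reconstructed diffs below, from `image_reference.py` and `text_to_image.py`, illustrate the migration. As a minimal sketch of the unified call path (mixed str/`MLLMImage` content in `actual_output` and the `url=` keyword are assumptions inferred from `convert_to_multi_modal_array` in the diff, not verified against the 3.7.6 API):

```python
from deepeval.metrics.multimodal_metrics.image_reference.image_reference import (
    ImageReferenceMetric,
)
from deepeval.test_case import LLMTestCase, MLLMImage

# 3.7.6 drops MLLMTestCase: image content now rides inside a regular
# LLMTestCase, and the metric normalizes it internally with
# convert_to_multi_modal_array(). The list-valued actual_output and the
# url= keyword are illustrative assumptions.
test_case = LLMTestCase(
    input="Explain the attached chart.",
    actual_output=[
        "Revenue grew every quarter.",
        MLLMImage(url="https://example.com/chart.png"),
    ],
)

metric = ImageReferenceMetric()  # judge model resolved via initialize_model(None)
metric.measure(test_case)        # validated by check_llm_test_case_params(...)
print(metric.score, metric.reason)
```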
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py

@@ -1,42 +1,46 @@
 import asyncio
 from typing import Optional, List, Tuple, Union
 
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_reference.template import (
     ImageReferenceTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_reference.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 
 
-class ImageReferenceMetric(BaseMultimodalMetric):
+class ImageReferenceMetric(BaseMetric):
 
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]
 
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -46,13 +50,19 @@ class ImageReferenceMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -69,7 +79,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output = test_case.actual_output
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []
@@ -146,13 +158,19 @@ class ImageReferenceMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
        _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -161,7 +179,9 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            actual_output = test_case.actual_output
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []
@@ -254,21 +274,14 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
             context_above, context_below
         )
-        prompt = [instructions, image]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def a_evaluate_image_reference(
         self,
@@ -279,21 +292,14 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         instructions = ImageReferenceTemplate.evaluate_image_reference(
             context_above, context_below
         )
-        prompt = [instructions, image]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -328,7 +334,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]
 
-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)
 
     def is_successful(self) -> bool:
@@ -337,7 +343,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
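Before 3.7.6, every metric inlined the same three-way judging branch: native models return a `(result, cost)` tuple that feeds `self.evaluation_cost`, schema-capable custom models take `schema=ReasonScore`, and anything else falls back to parsing raw output with `trimAndLoadJson`. The diff replaces each inline copy with the new `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` helpers in `deepeval/metrics/utils.py`. Their bodies are not part of this diff, so the sketch below is a hypothetical reconstruction from the call sites above and the old inline pattern they replace, not the actual 3.7.6 implementation:

```python
# Hypothetical reconstruction of the consolidated helper; names and
# signature are taken from the call sites shown in this diff.
from typing import Any, Callable, Tuple, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # assumed still exported


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Tuple],
    extract_json: Callable[[dict], Tuple],
) -> Tuple:
    if metric.using_native_model:
        # Native judges return (result, cost); fold cost into the running total.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a structured-output schema.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Plain-text fallback: parse JSON out of the raw completion.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```

The async twin would mirror this with `await metric.model.a_generate(...)`; the `extract_schema`/`extract_json` lambdas at each call site keep per-metric field mapping out of the shared helper.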
deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py

@@ -3,38 +3,42 @@ from typing import Optional, List, Tuple, Union
 import math
 import textwrap
 
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.text_to_image.template import (
     TextToImageTemplate,
 )
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import DeepEvalBaseMLLM
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.text_to_image.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator
 
-required_params: List[MLLMTestCaseParams] = [
-    MLLMTestCaseParams.INPUT,
-    MLLMTestCaseParams.ACTUAL_OUTPUT,
+required_params: List[LLMTestCaseParams] = [
+    LLMTestCaseParams.INPUT,
+    LLMTestCaseParams.ACTUAL_OUTPUT,
 ]
 
 
-class TextToImageMetric(BaseMultimodalMetric):
+class TextToImageMetric(BaseMetric):
     def __init__(
         self,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
     ):
-        self.model, self.using_native_model = initialize_multimodal_model(model)
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode
@@ -43,11 +47,19 @@ class TextToImageMetric(BaseMultimodalMetric):
 
     def measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
     ) -> float:
-        check_mllm_test_case_params(test_case, required_params, 0, 1, self)
+        check_llm_test_case_params(
+            test_case,
+            required_params,
+            0,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -63,10 +75,12 @@ class TextToImageMetric(BaseMultimodalMetric):
                    )
                )
            else:
-                input_texts, _ = self.separate_images_from_text(test_case.input)
-                _, output_images = self.separate_images_from_text(
+                input = convert_to_multi_modal_array(test_case.input)
+                actual_output = convert_to_multi_modal_array(
                     test_case.actual_output
                 )
+                input_texts, _ = self.separate_images_from_text(input)
+                _, output_images = self.separate_images_from_text(actual_output)
 
                 self.SC_scores, self.SC_reasoning = (
                     self._evaluate_semantic_consistency(
@@ -90,7 +104,7 @@ class TextToImageMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.PQ_score}",
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -99,11 +113,19 @@ class TextToImageMetric(BaseMultimodalMetric):
 
     async def a_measure(
         self,
-        test_case: MLLMTestCase,
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
     ) -> float:
-        check_mllm_test_case_params(test_case, required_params, 0, 1, self)
+        check_llm_test_case_params(
+            test_case,
+            required_params,
+            0,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -112,10 +134,12 @@ class TextToImageMetric(BaseMultimodalMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            input_texts, _ = self.separate_images_from_text(test_case.input)
-            _, output_images = self.separate_images_from_text(
+            input = convert_to_multi_modal_array(test_case.input)
+            actual_output = convert_to_multi_modal_array(
                 test_case.actual_output
             )
+            input_texts, _ = self.separate_images_from_text(input)
+            _, output_images = self.separate_images_from_text(actual_output)
             (self.SC_scores, self.SC_reasoning), (
                 self.PQ_scores,
                 self.PQ_reasoning,
@@ -139,7 +163,7 @@ class TextToImageMetric(BaseMultimodalMetric):
                 steps=[
                     f"Semantic Consistency Scores:\n{self.SC_scores}",
                     f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                    f"Perceptual Quality Scores:\n{self.PQ_score}",
+                    f"Perceptual Quality Scores:\n{self.PQ_scores}",
                     f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
@@ -163,106 +187,86 @@ class TextToImageMetric(BaseMultimodalMetric):
         text_prompt: str,
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
-        images: List[MLLMImage] = []
-        images.append(actual_image_output)
-        prompt = [
-            TextToImageTemplate.generate_semantic_consistency_evaluation_results(
-                text_prompt=text_prompt
-            )
-        ]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt + images, ReasonScore
-            )
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(
-                    prompt + images, input_text=prompt
+        images: List[MLLMImage] = [actual_image_output]
+        prompt = f"""
+            {
+                TextToImageTemplate.generate_semantic_consistency_evaluation_results(
+                    text_prompt=text_prompt
                 )
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+            }
+            Images:
+            {images}
+        """
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_semantic_consistency(
         self,
         text_prompt: str,
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
-        images: List[MLLMImage] = []
-        images.append(actual_image_output)
-        prompt = [
-            TextToImageTemplate.generate_semantic_consistency_evaluation_results(
-                text_prompt=text_prompt
-            )
-        ]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt + images, schema=ReasonScore
+        images: List[MLLMImage] = [actual_image_output]
+        prompt = f"""
+            {
+                TextToImageTemplate.generate_semantic_consistency_evaluation_results(
+                    text_prompt=text_prompt
                 )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt + images, input_text=prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+            }
+            Images:
+            {images}
+        """
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def _a_evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = [actual_image_output]
-        prompt = [
-            TextToImageTemplate.generate_perceptual_quality_evaluation_results()
-        ]
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt + images, ReasonScore
-            )
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = await self.model.a_generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"""
+            {
+                TextToImageTemplate.generate_perceptual_quality_evaluation_results()
+            }
+            Images:
+            {images}
+        """
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
     ) -> Tuple[List[int], str]:
         images: List[MLLMImage] = [actual_image_output]
-        prompt = [
-            TextToImageTemplate.generate_perceptual_quality_evaluation_results()
-        ]
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt + images, ReasonScore)
-            self.evaluation_cost += cost
-            return res.score, res.reasoning
-        else:
-            try:
-                res: ReasonScore = self.model.generate(
-                    prompt + images, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt + images)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"""
+            {
+                TextToImageTemplate.generate_perceptual_quality_evaluation_results()
+            }
+            Images:
+            {images}
+        """
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
-    def _calculate_score(self) -> List[float]:
+    def _calculate_score(self) -> float:
         min_SC_score = min(self.SC_scores)
         min_PQ_score = min(self.PQ_scores)
         return math.sqrt(min_SC_score * min_PQ_score) / 10
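For reference, `_calculate_score` combines the two judge passes by taking the worst semantic-consistency score and the worst perceptual-quality score, then the geometric mean rescaled from the judge's 0-10 range to 0-1, so a single weak image caps the whole result. A quick arithmetic check with illustrative values:

```python
import math

# Illustrative judge outputs on a 0-10 scale, one entry per generated image.
SC_scores = [8, 9]   # semantic consistency
PQ_scores = [9, 10]  # perceptual quality

score = math.sqrt(min(SC_scores) * min(PQ_scores)) / 10
print(round(score, 3))  # 0.849, compared against the default 0.5 threshold
```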
@@ -272,14 +276,12 @@ class TextToImageMetric(BaseMultimodalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
-    def _generate_reason(
-        self,
-    ) -> Tuple[List[float], str]:
+    def _generate_reason(self) -> str:
         return textwrap.dedent(
             f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}