deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py (+37 -38)

@@ -1,16 +1,17 @@
 import asyncio
 from typing import Optional, List, Tuple, Union

-from deepeval.metrics import BaseMultimodalMetric
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_helpfulness.template import (
     ImageHelpfulnessTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_helpfulness.schema import (
@@ -23,7 +24,7 @@ from deepeval.utils import (
 )


-class ImageHelpfulnessMetric(BaseMultimodalMetric):
+class ImageHelpfulnessMetric(BaseMetric):

     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
@@ -54,8 +55,14 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -156,8 +163,14 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -262,20 +275,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def a_evaluate_image_helpfulness(
         self,
@@ -287,20 +293,13 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -335,7 +334,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]

-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)

     def is_successful(self) -> bool:
@@ -344,7 +343,7 @@ class ImageHelpfulnessMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
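The recurring change across these metric files is that the old inline try/except (schema-constrained generation with a `trimAndLoadJson` fallback) is replaced by shared helpers, `generate_with_schema_and_extract` and its async twin `a_generate_with_schema_and_extract`, imported from `deepeval.metrics.utils`. As a rough illustration only, here is a minimal sketch of the flow such a helper consolidates; the parameter names mirror the call sites in the diff, but the internals shown here (and the fallback JSON parsing) are assumptions — the real implementation in `deepeval/metrics/utils.py` also handles cost tracking and native-model branches and may differ.

```python
# Hedged sketch of the schema-first / JSON-fallback pattern the 3.7.6 metrics call.
# Assumption: the real generate_with_schema_and_extract also tracks evaluation cost
# and native-model behavior; only the fallback flow visible in the removed code
# above is reproduced here.
import json
from typing import Any, Callable, Tuple, Type


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type,
    extract_schema: Callable[[Any], Tuple],
    extract_json: Callable[[dict], Tuple],
) -> Tuple:
    try:
        # Schema-constrained generation for models that accept a `schema=` argument.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback: free-form generation, then parse the JSON payload
        # (the pre-3.7.6 code used trimAndLoadJson for this step).
        res = metric.model.generate(prompt)
        start, end = res.find("{"), res.rfind("}") + 1
        data = json.loads(res[start:end])
        return extract_json(data)
```

Each metric then supplies its own `extract_schema`/`extract_json` lambdas, as in the replacement lines above, so score/reason extraction stays local to the metric while the generation plumbing is shared.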
deepeval/metrics/multimodal_metrics/image_reference/image_reference.py (+37 -38)

@@ -1,16 +1,17 @@
 import asyncio
 from typing import Optional, List, Tuple, Union

-from deepeval.metrics import BaseMultimodalMetric
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_reference.template import (
     ImageReferenceTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_reference.schema import (
@@ -23,7 +24,7 @@ from deepeval.utils import (
 )


-class ImageReferenceMetric(BaseMultimodalMetric):
+class ImageReferenceMetric(BaseMetric):

     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
@@ -54,8 +55,14 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -156,8 +163,14 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -262,20 +275,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def a_evaluate_image_reference(
         self,
@@ -287,20 +293,13 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -335,7 +334,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]

-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)

     def is_successful(self) -> bool:
@@ -344,7 +343,7 @@ class ImageReferenceMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py (+57 -76)

@@ -3,7 +3,7 @@ from typing import Optional, List, Tuple, Union
 import math
 import textwrap

-from deepeval.metrics import BaseMultimodalMetric
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.text_to_image.template import (
     TextToImageTemplate,
@@ -14,9 +14,10 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.text_to_image.schema import ReasonScore
@@ -28,7 +29,7 @@ required_params: List[LLMTestCaseParams] = [
 ]


-class TextToImageMetric(BaseMultimodalMetric):
+class TextToImageMetric(BaseMetric):
     def __init__(
         self,
         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
@@ -50,8 +51,14 @@ class TextToImageMetric(BaseMultimodalMetric):
         _show_indicator: bool = True,
         _in_component: bool = False,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            required_params,
+            0,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -97,7 +104,7 @@ class TextToImageMetric(BaseMultimodalMetric):
             steps=[
                 f"Semantic Consistency Scores:\n{self.SC_scores}",
                 f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                f"Perceptual Quality Scores:\n{self.
+                f"Perceptual Quality Scores:\n{self.PQ_scores}",
                 f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
@@ -110,8 +117,14 @@ class TextToImageMetric(BaseMultimodalMetric):
         _show_indicator: bool = True,
         _in_component: bool = False,
     ) -> float:
-        check_mllm_test_case_params(
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            required_params,
+            0,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )

         self.evaluation_cost = 0 if self.using_native_model else None
@@ -150,7 +163,7 @@ class TextToImageMetric(BaseMultimodalMetric):
             steps=[
                 f"Semantic Consistency Scores:\n{self.SC_scores}",
                 f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                f"Perceptual Quality Scores:\n{self.
+                f"Perceptual Quality Scores:\n{self.PQ_scores}",
                 f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
@@ -174,8 +187,7 @@ class TextToImageMetric(BaseMultimodalMetric):
         text_prompt: str,
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
-        images: List[MLLMImage] = []
-        images.append(actual_image_output)
+        images: List[MLLMImage] = [actual_image_output]
         prompt = f"""
             {
                 TextToImageTemplate.generate_semantic_consistency_evaluation_results(
@@ -185,28 +197,20 @@ class TextToImageMetric(BaseMultimodalMetric):
             Images:
             {images}
         """
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(prompt, input_text=prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def _evaluate_semantic_consistency(
         self,
         text_prompt: str,
         actual_image_output: MLLMImage,
     ) -> Tuple[List[int], str]:
-        images: List[MLLMImage] = []
-        images.append(actual_image_output)
+        images: List[MLLMImage] = [actual_image_output]
         prompt = f"""
             {
                 TextToImageTemplate.generate_semantic_consistency_evaluation_results(
@@ -216,20 +220,13 @@ class TextToImageMetric(BaseMultimodalMetric):
             Images:
             {images}
         """
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def _a_evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -242,20 +239,13 @@ class TextToImageMetric(BaseMultimodalMetric):
             Images:
             {images}
         """
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def _evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -268,22 +258,15 @@ class TextToImageMetric(BaseMultimodalMetric):
             Images:
             {images}
         """
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

-    def _calculate_score(self) ->
+    def _calculate_score(self) -> float:
         min_SC_score = min(self.SC_scores)
         min_PQ_score = min(self.PQ_scores)
         return math.sqrt(min_SC_score * min_PQ_score) / 10
@@ -293,14 +276,12 @@ class TextToImageMetric(BaseMultimodalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

-    def _generate_reason(
-        self,
-    ) -> Tuple[List[float], str]:
+    def _generate_reason(self) -> str:
         return textwrap.dedent(
             f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}