deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py (removed)
@@ -1,102 +0,0 @@
-from typing import Union, List
-import textwrap
-
-from deepeval.test_case import MLLMImage
-
-
-class MultimodalContextualRelevancyTemplate:
-    @staticmethod
-    def generate_reason(
-        input: List[Union[str, MLLMImage]],
-        irrelevancies: List[str],
-        relevant_statements: List[str],
-        score: float,
-    ):
-        return (
-            [
-                textwrap.dedent(
-                    f"""Based on the given input, reasons for why the retrieval context is irrelevant to the input, the statements in the retrieval context that is actually relevant to the retrieval context, and the contextual relevancy score (the closer to 1 the better), please generate a CONCISE reason for the score.
-In your reason, you should quote data provided in the reasons for irrelevancy and relevant statements to support your point.
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-Example JSON:
-{{
-    "reason": "The score is <contextual_relevancy_score> because <your_reason>."
-}}
-
-If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-**
-
-
-Contextual Relevancy Score:
-{score}
-
-Input:
-"""
-                )
-            ]
-            + input
-            + [
-                textwrap.dedent(
-                    f"""
-Reasons for why the retrieval context is irrelevant to the input:
-{irrelevancies}
-
-Statement in the retrieval context that is relevant to the input:
-{relevant_statements}
-
-JSON:
-"""
-                )
-            ]
-        )
-
-    @staticmethod
-    def generate_verdicts(
-        input: List[Union[str, MLLMImage]], context: List[Union[str, MLLMImage]]
-    ) -> List[Union[str, MLLMImage]]:
-        return (
-            [
-                textwrap.dedent(
-                    f"""Based on the input and context (image or string), please generate a JSON object to indicate whether the context is relevant to the provided input. The JSON will be a list of 'verdicts', with 2 mandatory fields: 'verdict' and 'statement', and 1 optional field: 'reason'.
-If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
-If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
-The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the statement or image is relevant to the input.
-Provide a 'reason' ONLY IF verdict is no. You MUST quote the irrelevant parts of the statement or image to back up your reason.
-
-**
-IMPORTANT: Please make sure to only return in JSON format.
-Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
-Example Input: "What were some of Einstein's achievements?"
-
-Example:
-{{
-    "verdicts": [
-        {{
-            "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
-            "verdict": "yes"
-        }},
-        {{
-            "statement": "There was a cat.",
-            "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements.",
-            "verdict": "no"
-        }}
-    ]
-}}
-**
-
-Input:
-"""
-                )
-            ]
-            + input
-            + [
-                textwrap.dedent(
-                    """
-Context:
-"""
-                )
-            ]
-            + [context]
-        )

File without changes
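Aside (not part of the diff): the template above builds a single multimodal prompt by interleaving instruction strings with caller-supplied content. Below is a minimal sketch of that pattern, reconstructed only from the generate_verdicts signature shown in the hunk; the MLLMImage(url=...) constructor argument is an assumption and is not confirmed by this diff:

    from deepeval.test_case import MLLMImage
    from deepeval.metrics.multimodal_metrics.multimodal_contextual_relevancy.template import (
        MultimodalContextualRelevancyTemplate,  # removed in 3.7.5
    )

    # Both arguments are mixed lists of strings and images; the method returns the
    # instruction text, followed by the input items, a "Context:" header, and the
    # context list appended as a single trailing element.
    prompt = MultimodalContextualRelevancyTemplate.generate_verdicts(
        input=["What were some of Einstein's achievements?"],
        context=[
            "Einstein won the Nobel Prize in 1968.",
            MLLMImage(url="https://example.com/einstein.jpg"),  # assumed constructor
        ],
    )
    # prompt is a List[Union[str, MLLMImage]], ready to be passed to a multimodal judge model.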
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py (removed)
@@ -1,356 +0,0 @@
-from typing import List, Optional, Union
-import asyncio
-
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
-from deepeval.metrics.multimodal_metrics.multimodal_faithfulness.template import (
-    MultimodalFaithfulnessTemplate,
-)
-from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.utils import (
-    construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
-)
-from deepeval.models import DeepEvalBaseLLM
-from deepeval.metrics.multimodal_metrics.multimodal_faithfulness.schema import *
-from deepeval.metrics.indicator import metric_progress_indicator
-
-
-class MultimodalFaithfulnessMetric(BaseMultimodalMetric):
-
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
-        MLLMTestCaseParams.RETRIEVAL_CONTEXT,
-    ]
-
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
-        include_reason: bool = True,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-        truths_extraction_limit: Optional[int] = None,
-    ):
-        self.threshold = 1 if strict_mode else threshold
-        self.model, self.using_native_model = initialize_multimodal_model(model)
-        self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
-
-        self.truths_extraction_limit = truths_extraction_limit
-        if self.truths_extraction_limit is not None:
-            self.truths_extraction_limit = max(self.truths_extraction_limit, 0)
-
-    def measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            if self.async_mode:
-                loop = get_or_create_event_loop()
-                loop.run_until_complete(
-                    self.a_measure(
-                        test_case,
-                        _show_indicator=False,
-                        _in_component=_in_component,
-                        _log_metric_to_confident=_log_metric_to_confident,
-                    )
-                )
-            else:
-                self.truths = self._generate_truths(test_case.retrieval_context)
-                self.claims = self._generate_claims(test_case.actual_output)
-                self.verdicts = self._generate_verdicts()
-                self.score = self._calculate_score()
-                self.reason = self._generate_reason()
-                self.success = self.score >= self.threshold
-                self.verbose_logs = construct_verbose_logs(
-                    self,
-                    steps=[
-                        f"Truths (limit={self.truths_extraction_limit}):\n{prettify_list(self.truths)}",
-                        f"Claims:\n{prettify_list(self.claims)}",
-                        f"Verdicts:\n{prettify_list(self.verdicts)}",
-                        f"Score: {self.score}\nReason: {self.reason}",
-                    ],
-                )
-
-            return self.score
-
-    async def a_measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            async_mode=True,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            self.truths, self.claims = await asyncio.gather(
-                self._a_generate_truths(test_case.retrieval_context),
-                self._a_generate_claims(test_case.actual_output),
-            )
-            self.verdicts = await self._a_generate_verdicts()
-            self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason()
-            self.success = self.score >= self.threshold
-            self.verbose_logs = construct_verbose_logs(
-                self,
-                steps=[
-                    f"Truths (limit={self.truths_extraction_limit}):\n{prettify_list(self.truths)}",
-                    f"Claims:\n{prettify_list(self.claims)}",
-                    f"Verdicts:\n{prettify_list(self.verdicts)}",
-                    f"Score: {self.score}\nReason: {self.reason}",
-                ],
-            )
-
-            return self.score
-
-    async def _a_generate_reason(self) -> str:
-        if self.include_reason is False:
-            return None
-
-        contradictions = []
-        for verdict in self.verdicts:
-            if verdict.verdict.strip().lower() == "no":
-                contradictions.append(verdict.reason)
-
-        prompt: dict = MultimodalFaithfulnessTemplate.generate_reason(
-            contradictions=contradictions,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=MultimodalFaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MultimodalFaithfulnessScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=MultimodalFaithfulnessScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    def _generate_reason(self) -> str:
-        if self.include_reason is False:
-            return None
-
-        contradictions = []
-        for verdict in self.verdicts:
-            if verdict.verdict.strip().lower() == "no":
-                contradictions.append(verdict.reason)
-
-        prompt: dict = MultimodalFaithfulnessTemplate.generate_reason(
-            contradictions=contradictions,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=MultimodalFaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MultimodalFaithfulnessScoreReason = self.model.generate(
-                    prompt, schema=MultimodalFaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]:
-        if len(self.claims) == 0:
-            return []
-
-        verdicts: List[FaithfulnessVerdict] = []
-        prompt = MultimodalFaithfulnessTemplate.generate_verdicts(
-            claims=self.claims, retrieval_context="\n\n".join(self.truths)
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _generate_verdicts(self) -> List[FaithfulnessVerdict]:
-        if len(self.claims) == 0:
-            return []
-
-        verdicts: List[FaithfulnessVerdict] = []
-        prompt = MultimodalFaithfulnessTemplate.generate_verdicts(
-            claims=self.claims, retrieval_context="\n\n".join(self.truths)
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    async def _a_generate_truths(
-        self, retrieval_context: List[Union[str, MLLMImage]]
-    ) -> List[str]:
-        prompt = MultimodalFaithfulnessTemplate.generate_truths(
-            excerpt=retrieval_context,
-            extraction_limit=self.truths_extraction_limit,
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = await self.model.a_generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
-
-    def _generate_truths(
-        self, retrieval_context: List[Union[str, MLLMImage]]
-    ) -> List[str]:
-        prompt = MultimodalFaithfulnessTemplate.generate_truths(
-            excerpt=retrieval_context,
-            extraction_limit=self.truths_extraction_limit,
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = self.model.generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
-
-    async def _a_generate_claims(
-        self, actual_output: List[Union[str, MLLMImage]]
-    ) -> List[str]:
-        prompt = MultimodalFaithfulnessTemplate.generate_claims(
-            excerpt=actual_output
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
-
-    def _generate_claims(
-        self, actual_output: List[Union[str, MLLMImage]]
-    ) -> List[str]:
-        prompt = MultimodalFaithfulnessTemplate.generate_claims(
-            excerpt=actual_output
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
-
-    def _calculate_score(self) -> float:
-        number_of_verdicts = len(self.verdicts)
-        if number_of_verdicts == 0:
-            return 1
-
-        faithfulness_count = 0
-        for verdict in self.verdicts:
-            if verdict.verdict.strip().lower() != "no":
-                faithfulness_count += 1
-
-        score = faithfulness_count / number_of_verdicts
-        return 0 if self.strict_mode and score < self.threshold else score
-
-    def is_successful(self) -> bool:
-        if self.error is not None:
-            self.success = False
-        else:
-            try:
-                self.success = self.score >= self.threshold
-            except:
-                self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        return "Multimodal Faithfulness"
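For readers auditing the removal above: in 3.7.4 the class was driven roughly as follows. This is a hedged sketch assembled from the signatures in the hunk (constructor defaults, measure() -> float, and the required INPUT / ACTUAL_OUTPUT / RETRIEVAL_CONTEXT params); the MLLMTestCase keyword arguments and the MLLMImage(url=...) constructor are assumptions, not shown in this diff:

    from deepeval.test_case import MLLMTestCase, MLLMImage
    from deepeval.metrics.multimodal_metrics.multimodal_faithfulness.multimodal_faithfulness import (
        MultimodalFaithfulnessMetric,  # removed in 3.7.5
    )

    # A multimodal test case: text plus an image in the retrieved context
    # (field names assumed to mirror the attributes accessed in measure()).
    test_case = MLLMTestCase(
        input=["What does the chart show?"],
        actual_output=["Revenue grew 20% year over year."],
        retrieval_context=[
            "Quarterly report: revenue grew 20% YoY.",
            MLLMImage(url="https://example.com/chart.png"),  # assumed constructor
        ],
    )

    # Defaults mirror the __init__ shown above (threshold=0.5, include_reason=True).
    metric = MultimodalFaithfulnessMetric(threshold=0.5, include_reason=True)
    score = metric.measure(test_case)  # float in [0, 1]; also sets metric.reason
    print(score, metric.reason)

The file list at the top of this diff shows turn-based metrics (turn_faithfulness, turn_contextual_precision/recall/relevancy) being added in the same release, while the multimodal_* metric modules and deepeval/models/mlllms are removed.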
deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py (removed)
@@ -1,175 +0,0 @@
-from typing import Union, List, Optional
-import textwrap
-
-from deepeval.test_case import MLLMImage
-
-
-class MultimodalFaithfulnessTemplate:
-    @staticmethod
-    def generate_claims(excerpt):
-        return (
-            [
-                textwrap.dedent(
-                    f"""Based on the given excerpt, which contains text and possibly images, please generate a comprehensive list of FACTUAL, undisputed truths, that can inferred from the provided text and images.
-
-Example:
-Example Excerpt:
-"Einstein won the noble prize in 1968 for his discovery of the photoelectric effect."
-
-Example JSON:
-{{
-    "claims": [
-        "Einstein won the noble prize for his discovery of the photoelectric effect.",
-        "Einstein won the noble prize in 1968."
-    ]
-}}
-===== END OF EXAMPLE ======
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
-Only include claims that are factual, and the claims you extract should include the full context it was presented in, NOT cherry picked facts.
-You should NOT include any prior knowledge, and take the text at face value when extracting claims.
-**
-
-Text:
-"""
-                )
-            ]
-            + excerpt
-            + [
-                textwrap.dedent(
-                    f"""
-JSON:
-"""
-                )
-            ]
-        )
-
-    @staticmethod
-    def generate_truths(excerpt, extraction_limit: Optional[int] = None):
-        if extraction_limit is None:
-            limit = " FACTUAL, undisputed truths"
-        elif extraction_limit == 1:
-            limit = " the single most important FACTUAL, undisputed truth"
-        else:
-            limit = f" the {extraction_limit} most important FACTUAL, undisputed truths per document"
-        return (
-            [
-                textwrap.dedent(
-                    f"""Based on the given excerpt (text and images), please generate a comprehensive list of{limit}, that can inferred from the provided excerpt.
-
-Example:
-Example Excerpt:
-"Einstein won the noble prize in 1968 for his discovery of the photoelectric effect."
-
-Example JSON:
-{{
-    "truths": [
-        "Einstein won the noble prize for his discovery of the photoelectric effect.",
-        "Einstein won the noble prize in 1968."
-    ]
-}}
-===== END OF EXAMPLE ======
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the "truths" key as a list of strings. No words or explanation is needed.
-Only include truths that are factual.
-**
-
-Excerpt:
-"""
-                )
-            ]
-            + excerpt
-            + [
-                textwrap.dedent(
-                    f"""
-JSON:
-"""
-                )
-            ]
-        )
-
-    @staticmethod
-    def generate_verdicts(claims, retrieval_context):
-        return textwrap.dedent(
-            f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
-The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
-Provide a 'reason' ONLY if the answer is 'no' or 'idk'.
-The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
-Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
-Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."]
-
-Example:
-{{
-    "verdicts": [
-        {{
-            "reason": "The claim about Barack Obama is not directly addressed in the retrieval context, and so poses no contradiction.",
-            "verdict": "idk"
-        }},
-        {{
-            "reason": "The claim about Zurich being a city in London is incorrect but does not pose a contradiction to the retrieval context.",
-            "verdict": "idk"
-        }},
-        {{
-            "verdict": "yes"
-        }},
-        {{
-            "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead.",
-            "verdict": "no"
-        }},
-        {{
-            "reason": "The actual output claims Einstein is a German chef, which is not correct as the retrieval context states he was a German scientist instead.",
-            "verdict": "no"
-        }}
-    ]
-}}
-===== END OF EXAMPLE ======
-
-The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
-You DON'T have to provide a reason if the answer is 'yes'.
-ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
-Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
-Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
-**
-
-Retrieval Contexts:
-{retrieval_context}
-
-Claims:
-{claims}
-
-JSON:
-"""
-        )
-
-    @staticmethod
-    def generate_reason(score, contradictions):
-        return textwrap.dedent(
-            f"""Below is a list of Contradictions. It is a list of strings explaining why the 'actual output' does not align with the information presented in the 'retrieval context'. Contradictions happen in the 'actual output', NOT the 'retrieval context'.
-Given the faithfulness score, which is a 0-1 score indicating how faithful the `actual output` is to the retrieval context (higher the better), CONCISELY summarize the contradictions to justify the score.
-
-**
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-Example JSON:
-{{
-    "reason": "The score is <faithfulness_score> because <your_reason>."
-}}
-
-If there are no contradictions, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-Your reason MUST use information in `contradiction` in your reason.
-Be sure in your reason, as if you know what the actual output is from the contradictions.
-**
-
-Faithfulness Score:
-{score}
-
-Contradictions:
-{contradictions}
-
-JSON:
-"""
-        )

File without changes