deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py
DELETED
@@ -1,285 +0,0 @@
-from typing import Optional, List, Union
-
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
-from deepeval.metrics.multimodal_metrics.multimodal_contextual_recall.template import (
-    MultimodalContextualRecallTemplate,
-)
-from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.utils import (
-    construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
-)
-from deepeval.models import DeepEvalBaseMLLM
-from deepeval.metrics.multimodal_metrics.multimodal_contextual_recall.schema import *
-from deepeval.metrics.indicator import metric_progress_indicator
-
-
-class MultimodalContextualRecallMetric(BaseMultimodalMetric):
-
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
-        MLLMTestCaseParams.RETRIEVAL_CONTEXT,
-        MLLMTestCaseParams.EXPECTED_OUTPUT,
-    ]
-
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-        include_reason: bool = True,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-    ):
-        self.threshold = 1 if strict_mode else threshold
-        self.model, self.using_native_model = initialize_multimodal_model(model)
-        self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
-
-    def measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self, _show_indicator=_show_indicator, _in_component=_in_component
-        ):
-            if self.async_mode:
-                loop = get_or_create_event_loop()
-                loop.run_until_complete(
-                    self.a_measure(
-                        test_case,
-                        _show_indicator=False,
-                        _in_component=_in_component,
-                        _log_metric_to_confident=_log_metric_to_confident,
-                    )
-                )
-            else:
-                self.verdicts: List[ContextualRecallVerdict] = (
-                    self._generate_verdicts(
-                        test_case.expected_output, test_case.retrieval_context
-                    )
-                )
-                self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.expected_output)
-                self.success = self.score >= self.threshold
-                self.verbose_logs = construct_verbose_logs(
-                    self,
-                    steps=[
-                        f"Verdicts:\n{prettify_list(self.verdicts)}",
-                        f"Score: {self.score}\nReason: {self.reason}",
-                    ],
-                )
-
-            return self.score
-
-    async def a_measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            async_mode=True,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            self.verdicts: List[ContextualRecallVerdict] = (
-                await self._a_generate_verdicts(
-                    test_case.expected_output, test_case.retrieval_context
-                )
-            )
-            self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(
-                test_case.expected_output
-            )
-            self.success = self.score >= self.threshold
-            self.verbose_logs = construct_verbose_logs(
-                self,
-                steps=[
-                    f"Verdicts:\n{prettify_list(self.verdicts)}",
-                    f"Score: {self.score}\nReason: {self.reason}",
-                ],
-            )
-
-            return self.score
-
-    async def _a_generate_reason(
-        self, expected_output: List[Union[str, MLLMImage]]
-    ):
-        if self.include_reason is False:
-            return None
-
-        supportive_reasons = []
-        unsupportive_reasons = []
-        for verdict in self.verdicts:
-            if verdict.verdict.lower() == "yes":
-                supportive_reasons.append(verdict.reason)
-            else:
-                unsupportive_reasons.append(verdict.reason)
-
-        prompt = MultimodalContextualRecallTemplate.generate_reason(
-            expected_output=expected_output,
-            supportive_reasons=supportive_reasons,
-            unsupportive_reasons=unsupportive_reasons,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=MultimodalContextualRecallScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MultimodalContextualRecallScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=MultimodalContextualRecallScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    def _generate_reason(self, expected_output: List[Union[str, MLLMImage]]):
-        if self.include_reason is False:
-            return None
-
-        supportive_reasons = []
-        unsupportive_reasons = []
-        for verdict in self.verdicts:
-            if verdict.verdict.lower() == "yes":
-                supportive_reasons.append(verdict.reason)
-            else:
-                unsupportive_reasons.append(verdict.reason)
-
-        prompt = MultimodalContextualRecallTemplate.generate_reason(
-            expected_output=expected_output,
-            supportive_reasons=supportive_reasons,
-            unsupportive_reasons=unsupportive_reasons,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=MultimodalContextualRecallScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MultimodalContextualRecallScoreReason = (
-                    self.model.generate(
-                        prompt, schema=MultimodalContextualRecallScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    def _calculate_score(self):
-        number_of_verdicts = len(self.verdicts)
-        if number_of_verdicts == 0:
-            return 0
-
-        justified_sentences = 0
-        for verdict in self.verdicts:
-            if verdict.verdict.lower() == "yes":
-                justified_sentences += 1
-
-        score = justified_sentences / number_of_verdicts
-        return 0 if self.strict_mode and score < self.threshold else score
-
-    async def _a_generate_verdicts(
-        self,
-        expected_output: List[Union[str, MLLMImage]],
-        retrieval_context: List[Union[str, MLLMImage]],
-    ) -> List[ContextualRecallVerdict]:
-        prompt = MultimodalContextualRecallTemplate.generate_verdicts(
-            expected_output=expected_output, retrieval_context=retrieval_context
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts: Verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts: Verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualRecallVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _generate_verdicts(
-        self,
-        expected_output: List[Union[str, MLLMImage]],
-        retrieval_context: List[Union[str, MLLMImage]],
-    ) -> List[ContextualRecallVerdict]:
-        prompt = MultimodalContextualRecallTemplate.generate_verdicts(
-            expected_output=expected_output, retrieval_context=retrieval_context
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts: Verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts: Verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualRecallVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def is_successful(self) -> bool:
-        if self.error is not None:
-            self.success = False
-        else:
-            try:
-                self.success = self.score >= self.threshold
-            except:
-                self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        return "Multimodal Contextual Recall"
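For orientation, here is a minimal, hypothetical usage sketch of the metric deleted above, reconstructed only from the constructor and measure() signatures visible in this diff. The MLLMTestCase keyword arguments (input, actual_output, expected_output, retrieval_context) are assumed from the required params the metric checks, the toy strings are invented, and a configured evaluation model (or an explicit model argument) is needed for the underlying LLM calls to succeed:

# Hypothetical sketch against deepeval 3.7.4, the last version that ships this module;
# not an official example, and the MLLMTestCase kwargs below are assumptions.
from deepeval.metrics.multimodal_metrics.multimodal_contextual_recall.multimodal_contextual_recall import (
    MultimodalContextualRecallMetric,
)
from deepeval.test_case import MLLMTestCase

test_case = MLLMTestCase(
    input=["Summarize last quarter's revenue."],
    actual_output=["Revenue grew 20% year over year."],
    expected_output=["Revenue grew 20% year over year."],
    retrieval_context=["Q3 report: revenue grew 20% YoY."],
)

metric = MultimodalContextualRecallMetric(threshold=0.5, include_reason=True)
score = metric.measure(test_case)  # a_measure() is the asynchronous counterpart
print(score, metric.reason)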
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py
DELETED
@@ -1,15 +0,0 @@
-from typing import List, Optional
-from pydantic import BaseModel, Field
-
-
-class ContextualRecallVerdict(BaseModel):
-    verdict: str
-    reason: str
-
-
-class Verdicts(BaseModel):
-    verdicts: List[ContextualRecallVerdict]
-
-
-class MultimodalContextualRecallScoreReason(BaseModel):
-    reason: str
deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py
DELETED
@@ -1,112 +0,0 @@
-from typing import Union, List
-import textwrap
-
-from deepeval.test_case import MLLMImage
-
-
-class MultimodalContextualRecallTemplate:
-    @staticmethod
-    def generate_reason(
-        expected_output, supportive_reasons, unsupportive_reasons, score
-    ) -> List[Union[str, MLLMImage]]:
-        return (
-            [
-                textwrap.dedent(
-                    f"""Given the original expected output, a list of supportive reasons, and a list of unsupportive reasons (which is deduced directly from the 'expected output'), and a contextual recall score (closer to 1 the better), summarize a CONCISE reason for the score.
-                    A supportive reason is the reason why a certain sentence or image in the original expected output can be attributed to the node in the retrieval context.
-                    An unsupportive reason is the reason why a certain sentence or image in the original expected output cannot be attributed to anything in the retrieval context.
-                    In your reason, you should related supportive/unsupportive reasons to the sentence or image number in expected output, and info regarding the node number in retrieval context to support your final reason. The first mention of "node(s)" should specify "node(s) in retrieval context)".
-
-                    **
-                    IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-                    Example JSON:
-                    {{
-                        "reason": "The score is <contextual_recall_score> because <your_reason>."
-                    }}
-
-                    DO NOT mention 'supportive reasons' and 'unsupportive reasons' in your reason, these terms are just here for you to understand the broader scope of things.
-                    If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-                    **
-
-                    Contextual Recall Score:
-                    {score}
-
-                    Expected Output:
-                    """
-                )
-            ]
-            + expected_output
-            + [
-                textwrap.dedent(
-                    f"""Supportive Reasons:
-                    {supportive_reasons}
-
-                    Unsupportive Reasons:
-                    {unsupportive_reasons}
-
-                    JSON:"""
-                )
-            ]
-        )
-
-    @staticmethod
-    def generate_verdicts(
-        expected_output, retrieval_context
-    ) -> List[Union[str, MLLMImage]]:
-        return (
-            [
-                textwrap.dedent(
-                    f"""For EACH sentence and image in the given expected output below, determine whether the sentence or image can be attributed to the nodes of retrieval contexts. Please generate a list of JSON with two keys: `verdict` and `reason`.
-                    The `verdict` key should STRICTLY be either a 'yes' or 'no'. Answer 'yes' if the sentence or image can be attributed to any parts of the retrieval context, else answer 'no'.
-                    The `reason` key should provide a reason why to the verdict. In the reason, you should aim to include the node(s) count in the retrieval context (eg., 1st node, and 2nd node in the retrieval context) that is attributed to said sentence or image. A node is either a string or image, but not both (so do not group images and texts in the same nodes). You should also aim to quote the specific part of the retrieval context to justify your verdict, but keep it extremely concise and cut short the quote with an ellipsis if possible.
-
-                    **
-                    IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects, each with two keys: `verdict` and `reason`.
-
-                    {{
-                        "verdicts": [
-                            {{
-                                "reason": "...",
-                                "verdict": "yes"
-                            }},
-                            ...
-                        ]
-                    }}
-
-                    Since you are going to generate a verdict for each sentence, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of sentences and images in the `expected output`.
-                    **
-
-                    Expected Output:
-                    """
-                )
-            ]
-            + expected_output
-            + [
-                textwrap.dedent(
-                    """
-                    Retrieval Context:
-                    """
-                )
-            ]
-            + MultimodalContextualRecallTemplate.id_retrieval_context(
-                retrieval_context
-            )
-            + [
-                textwrap.dedent(
-                    """
-                    JSON:
-                    """
-                )
-            ]
-        )
-
-    @staticmethod
-    def id_retrieval_context(retrieval_context) -> List[Union[str, MLLMImage]]:
-        annotated_retrieval_context = []
-        for i, context in enumerate(retrieval_context):
-            if isinstance(context, str):
-                annotated_retrieval_context.append(f"Node {i + 1}: {context}")
-            elif isinstance(context, MLLMImage):
-                annotated_retrieval_context.append(f"Node {i + 1}:")
-                annotated_retrieval_context.append(context)
-        return annotated_retrieval_context
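A quick illustration (not taken from the package) of what the deleted template above actually produces: generate_verdicts returns a flat List[Union[str, MLLMImage]] that interleaves the instruction text, the raw expected_output items, and the retrieval context annotated as numbered nodes, with images kept as separate MLLMImage entries. The toy data and the MLLMImage(url=...) constructor below are assumptions:

# Hypothetical illustration of the interleaved prompt built by the deleted
# MultimodalContextualRecallTemplate (toy inputs; MLLMImage(url=...) is assumed).
from deepeval.test_case import MLLMImage
from deepeval.metrics.multimodal_metrics.multimodal_contextual_recall.template import (
    MultimodalContextualRecallTemplate,
)

prompt = MultimodalContextualRecallTemplate.generate_verdicts(
    expected_output=["Revenue grew 20% year over year."],
    retrieval_context=[
        "Q3 report: revenue grew 20% YoY.",
        MLLMImage(url="https://example.com/revenue-chart.png"),
    ],
)
# Roughly: [instructions ending in "Expected Output:",
#           "Revenue grew 20% year over year.",
#           "Retrieval Context:",
#           "Node 1: Q3 report: revenue grew 20% YoY.",
#           "Node 2:", <MLLMImage>,
#           "JSON:"]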
deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py
DELETED
@@ -1,282 +0,0 @@
-from typing import Optional, List, Union
-import asyncio
-
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCaseParams, MLLMTestCase, MLLMImage
-from deepeval.metrics.multimodal_metrics.multimodal_contextual_relevancy.template import (
-    MultimodalContextualRelevancyTemplate,
-)
-from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.utils import (
-    construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
-)
-
-from deepeval.models import DeepEvalBaseMLLM
-from deepeval.metrics.multimodal_metrics.multimodal_contextual_relevancy.schema import *
-from deepeval.metrics.indicator import metric_progress_indicator
-
-
-class MultimodalContextualRelevancyMetric(BaseMultimodalMetric):
-
-    _required_params: List[MLLMTestCaseParams] = [
-        MLLMTestCaseParams.INPUT,
-        MLLMTestCaseParams.ACTUAL_OUTPUT,
-        MLLMTestCaseParams.RETRIEVAL_CONTEXT,
-    ]
-
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-        include_reason: bool = True,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-    ):
-        self.threshold = 1 if strict_mode else threshold
-        self.model, self.using_native_model = initialize_multimodal_model(model)
-        self.evaluation_model = self.model.get_model_name()
-        self.include_reason = include_reason
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
-
-    def measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self, _show_indicator=_show_indicator, _in_component=_in_component
-        ):
-            if self.async_mode:
-                loop = get_or_create_event_loop()
-                loop.run_until_complete(
-                    self.a_measure(
-                        test_case,
-                        _show_indicator=False,
-                        _in_component=_in_component,
-                        _log_metric_to_confident=_log_metric_to_confident,
-                    )
-                )
-            else:
-                self.verdicts_list: List[ContextualRelevancyVerdicts] = [
-                    (self._generate_verdicts(test_case.input, context))
-                    for context in test_case.retrieval_context
-                ]
-                self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
-                self.success = self.score >= self.threshold
-                self.verbose_logs = construct_verbose_logs(
-                    self,
-                    steps=[
-                        f"Verdicts:\n{prettify_list(self.verdicts_list)}",
-                        f"Score: {self.score}\nReason: {self.reason}",
-                    ],
-                )
-
-            return self.score
-
-    async def a_measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            async_mode=True,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            self.verdicts_list: List[ContextualRelevancyVerdicts] = (
-                await asyncio.gather(
-                    *[
-                        self._a_generate_verdicts(test_case.input, context)
-                        for context in test_case.retrieval_context
-                    ]
-                )
-            )
-            self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
-            self.success = self.score >= self.threshold
-            self.verbose_logs = construct_verbose_logs(
-                self,
-                steps=[
-                    f"Verdicts:\n{prettify_list(self.verdicts_list)}",
-                    f"Score: {self.score}\nReason: {self.reason}",
-                ],
-            )
-
-            return self.score
-
-    async def _a_generate_reason(self, input: List[Union[str, MLLMImage]]):
-        if self.include_reason is False:
-            return None
-
-        irrelevancies = []
-        relevant_statements = []
-        for verdicts in self.verdicts_list:
-            for verdict in verdicts.verdicts:
-                if verdict.verdict.lower() == "no":
-                    irrelevancies.append(verdict.reason)
-                else:
-                    relevant_statements.append(verdict.statement)
-
-        prompt: dict = MultimodalContextualRelevancyTemplate.generate_reason(
-            input=input,
-            irrelevancies=irrelevancies,
-            relevant_statements=relevant_statements,
-            score=format(self.score, ".2f"),
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=MultimodelContextualRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MultimodelContextualRelevancyScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=MultimodelContextualRelevancyScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    def _generate_reason(self, input: List[Union[str, MLLMImage]]):
-        if self.include_reason is False:
-            return None
-
-        irrelevancies = []
-        relevant_statements = []
-        for verdicts in self.verdicts_list:
-            for verdict in verdicts.verdicts:
-                if verdict.verdict.lower() == "no":
-                    irrelevancies.append(verdict.reason)
-                else:
-                    relevant_statements.append(verdict.statement)
-
-        prompt: dict = MultimodalContextualRelevancyTemplate.generate_reason(
-            input=input,
-            irrelevancies=irrelevancies,
-            relevant_statements=relevant_statements,
-            score=format(self.score, ".2f"),
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=MultimodelContextualRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: MultimodelContextualRelevancyScoreReason = (
-                    self.model.generate(
-                        prompt, schema=MultimodelContextualRelevancyScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    def _calculate_score(self):
-        total_verdicts = 0
-        relevant_statements = 0
-        for verdicts in self.verdicts_list:
-            for verdict in verdicts.verdicts:
-                total_verdicts += 1
-                if verdict.verdict.lower() == "yes":
-                    relevant_statements += 1
-
-        if total_verdicts == 0:
-            return 0
-
-        score = relevant_statements / total_verdicts
-        return 0 if self.strict_mode and score < self.threshold else score
-
-    async def _a_generate_verdicts(
-        self,
-        input: List[Union[str, MLLMImage]],
-        context: List[Union[str, MLLMImage]],
-    ) -> ContextualRelevancyVerdicts:
-        prompt = MultimodalContextualRelevancyTemplate.generate_verdicts(
-            input=input, context=context
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualRelevancyVerdicts
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res = await self.model.a_generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ContextualRelevancyVerdicts(**data)
-
-    def _generate_verdicts(
-        self,
-        input: List[Union[str, MLLMImage]],
-        context: List[Union[str, MLLMImage]],
-    ) -> ContextualRelevancyVerdicts:
-        prompt = MultimodalContextualRelevancyTemplate.generate_verdicts(
-            input=input, context=context
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualRelevancyVerdicts
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res = self.model.generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ContextualRelevancyVerdicts(**data)
-
-    def is_successful(self) -> bool:
-        if self.error is not None:
-            self.success = False
-        else:
-            try:
-                self.success = self.score >= self.threshold
-            except:
-                self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        return "Multimodal Contextual Relevancy"