deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
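
Notable from the file list: the prompt-optimization code moved from `deepeval.optimization` to `deepeval.optimizer` (gaining a MIPROv2 implementation), the multimodal RAG metrics and the `deepeval.models.mlllms` wrappers were removed, and four turn-level RAG metric packages were added under `deepeval/metrics/turn_*`. Below is a minimal, hypothetical sketch of the import adjustment an upgrader might need; the class names are inferred from the file paths in this diff and are not confirmed by it.

```python
# Hypothetical upgrade note (names inferred from file paths in this diff, may differ).
try:
    # deepeval >= 3.7.5: optimization code lives under deepeval.optimizer
    from deepeval.optimizer.prompt_optimizer import PromptOptimizer
except ImportError:
    # deepeval <= 3.7.4: the old deepeval.optimization package
    from deepeval.optimization.prompt_optimizer import PromptOptimizer

# New turn-level metric modules added in 3.7.5, e.g.
# deepeval/metrics/turn_faithfulness/turn_faithfulness.py; the public class
# name below is an assumption:
# from deepeval.metrics.turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric
```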
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py
@@ -1,301 +0,0 @@
-from typing import Optional, List, Union
-
-from deepeval.metrics import BaseMultimodalMetric
-from deepeval.test_case import MLLMTestCase
-from deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.template import (
-    MultiModalContextualPrecisionTemplate,
-)
-from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.utils import (
-    construct_verbose_logs,
-    trimAndLoadJson,
-    check_mllm_test_case_params,
-    initialize_multimodal_model,
-)
-from deepeval.test_case import LLMTestCaseParams
-from deepeval.models import DeepEvalBaseMLLM
-import deepeval.metrics.multimodal_metrics.multimodal_contextual_precision.schema as mcpschema
-from deepeval.metrics.indicator import metric_progress_indicator
-
-
-class MultimodalContextualPrecisionMetric(BaseMultimodalMetric):
-
-    _required_params: List[LLMTestCaseParams] = [
-        LLMTestCaseParams.INPUT,
-        LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.RETRIEVAL_CONTEXT,
-        LLMTestCaseParams.EXPECTED_OUTPUT,
-    ]
-
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        model: Optional[Union[str, DeepEvalBaseMLLM]] = None,
-        include_reason: bool = True,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-    ):
-        self.threshold = 1 if strict_mode else threshold
-        self.include_reason = include_reason
-        self.model, self.using_native_model = initialize_multimodal_model(model)
-        self.evaluation_model = self.model.get_model_name()
-        self.async_mode = async_mode
-        self.strict_mode = strict_mode
-        self.verbose_mode = verbose_mode
-
-    def measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            if self.async_mode:
-                loop = get_or_create_event_loop()
-                loop.run_until_complete(
-                    self.a_measure(
-                        test_case,
-                        _show_indicator=False,
-                        _in_component=_in_component,
-                        _log_metric_to_confident=_log_metric_to_confident,
-                    )
-                )
-            else:
-                self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
-                    self._generate_verdicts(
-                        test_case.input,
-                        test_case.expected_output,
-                        test_case.retrieval_context,
-                    )
-                )
-                self.score = self._calculate_score()
-                self.reason = self._generate_reason(test_case.input)
-                self.success = self.score >= self.threshold
-                self.verbose_logs = construct_verbose_logs(
-                    self,
-                    steps=[
-                        f"Verdicts:\n{prettify_list(self.verdicts)}",
-                        f"Score: {self.score}\nReason: {self.reason}",
-                    ],
-                )
-
-            return self.score
-
-    async def a_measure(
-        self,
-        test_case: MLLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        check_mllm_test_case_params(
-            test_case, self._required_params, None, None, self
-        )
-
-        self.evaluation_cost = 0 if self.using_native_model else None
-        with metric_progress_indicator(
-            self,
-            async_mode=True,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        ):
-            self.verdicts: List[mcpschema.ContextualPrecisionVerdict] = (
-                await self._a_generate_verdicts(
-                    test_case.input,
-                    test_case.expected_output,
-                    test_case.retrieval_context,
-                )
-            )
-            self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(test_case.input)
-            self.success = self.score >= self.threshold
-            self.verbose_logs = construct_verbose_logs(
-                self,
-                steps=[
-                    f"Verdicts:\n{prettify_list(self.verdicts)}",
-                    f"Score: {self.score}\nReason: {self.reason}",
-                ],
-            )
-
-            return self.score
-
-    async def _a_generate_reason(self, input: str) -> Optional[str]:
-        if self.include_reason is False:
-            return None
-
-        retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reason": verdict.reason}
-            for verdict in self.verdicts
-        ]
-        prompt = MultiModalContextualPrecisionTemplate.generate_reason(
-            input=input,
-            verdicts=retrieval_contexts_verdicts,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt,
-                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
-                    await self.model.a_generate(
-                        prompt,
-                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    def _generate_reason(self, input: str) -> Optional[str]:
-        if self.include_reason is False:
-            return None
-
-        retrieval_contexts_verdicts = [
-            {"verdict": verdict.verdict, "reason": verdict.reason}
-            for verdict in self.verdicts
-        ]
-        prompt = MultiModalContextualPrecisionTemplate.generate_reason(
-            input=input,
-            verdicts=retrieval_contexts_verdicts,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt,
-                schema=mcpschema.MultimodelContextualPrecisionScoreReason,
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: mcpschema.MultimodelContextualPrecisionScoreReason = (
-                    self.model.generate(
-                        prompt,
-                        schema=mcpschema.MultimodelContextualPrecisionScoreReason,
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
-
-    async def _a_generate_verdicts(
-        self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[mcpschema.ContextualPrecisionVerdict]:
-        prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
-            input=input,
-            expected_output=expected_output,
-            retrieval_context=retrieval_context,
-        )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=mcpschema.Verdicts
-            )
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: mcpschema.Verdicts = await self.model.a_generate(
-                    prompt, schema=mcpschema.Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    mcpschema.ContextualPrecisionVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _generate_verdicts(
-        self, input: str, expected_output: str, retrieval_context: List[str]
-    ) -> List[mcpschema.ContextualPrecisionVerdict]:
-        prompt = MultiModalContextualPrecisionTemplate.generate_verdicts(
-            input=input,
-            expected_output=expected_output,
-            retrieval_context=retrieval_context,
-        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=mcpschema.Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: mcpschema.Verdicts = self.model.generate(
-                    prompt, schema=mcpschema.Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    mcpschema.ContextualPrecisionVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _calculate_score(self):
-        number_of_verdicts = len(self.verdicts)
-        if number_of_verdicts == 0:
-            return 0
-
-        # Convert verdicts to a binary list where 'yes' is 1 and others are 0
-        node_verdicts = [
-            1 if v.verdict.strip().lower() == "yes" else 0
-            for v in self.verdicts
-        ]
-
-        sum_weighted_precision_at_k = 0.0
-        relevant_nodes_count = 0
-        for k, is_relevant in enumerate(node_verdicts, start=1):
-            # If the item is relevant, update the counter and add the weighted precision at k to the sum
-            if is_relevant:
-                relevant_nodes_count += 1
-                precision_at_k = relevant_nodes_count / k
-                sum_weighted_precision_at_k += precision_at_k * is_relevant
-
-        if relevant_nodes_count == 0:
-            return 0
-        # Calculate weighted cumulative precision
-        score = sum_weighted_precision_at_k / relevant_nodes_count
-        return 0 if self.strict_mode and score < self.threshold else score
-
-    def is_successful(self) -> bool:
-        if self.error is not None:
-            self.success = False
-        else:
-            try:
-                self.success = self.score >= self.threshold
-            except TypeError:
-                self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        return "Multimodal Contextual Precision"
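
For readers skimming the removed code: `_calculate_score` above computes a weighted cumulative precision (average precision) over the ranked retrieval nodes, adding precision@k only at ranks where the node is relevant and dividing by the number of relevant nodes. A standalone sketch of the same calculation, with a worked example:

```python
from typing import List


def weighted_cumulative_precision(verdicts: List[str]) -> float:
    """Average precision over ranked nodes, mirroring the removed _calculate_score logic."""
    relevant = 0
    total = 0.0
    for k, verdict in enumerate(verdicts, start=1):
        if verdict.strip().lower() == "yes":
            relevant += 1
            total += relevant / k  # precision@k, counted only at relevant ranks
    return total / relevant if relevant else 0.0


# e.g. ["yes", "no", "yes"] -> (1/1 + 2/3) / 2 = 0.833...
print(weighted_cumulative_precision(["yes", "no", "yes"]))
```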
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py
@@ -1,15 +0,0 @@
-from typing import List, Optional
-from pydantic import BaseModel, Field
-
-
-class ContextualPrecisionVerdict(BaseModel):
-    verdict: str
-    reason: str
-
-
-class Verdicts(BaseModel):
-    verdicts: List[ContextualPrecisionVerdict]
-
-
-class MultimodelContextualPrecisionScoreReason(BaseModel):
-    reason: str
deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py
@@ -1,132 +0,0 @@
-from typing import Union, List
-import textwrap
-
-from deepeval.test_case import MLLMImage
-
-
-class MultiModalContextualPrecisionTemplate:
-    @staticmethod
-    def generate_verdicts(
-        input: List[Union[str, MLLMImage]],
-        expected_output: List[Union[str, MLLMImage]],
-        retrieval_context: List[Union[str, MLLMImage]],
-    ) -> List[Union[str, MLLMImage]]:
-        document_count_str = f" ({len(retrieval_context)} document{'s' if len(retrieval_context) > 1 else ''})"
-        return (
-            [
-                textwrap.dedent(
-                    f"""Given the input, expected output, and retrieval context, please generate a list of JSON objects to determine whether each node in the retrieval context was remotely useful in arriving at the expected output.
-
-                    **
-                    IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON. These JSON only contain the `verdict` key that outputs only 'yes' or 'no', and a `reason` key to justify the verdict. In your reason, you should aim to quote parts of the context (which can be text or an image).
-                    Example Retrieval Context: ["Einstein won the Nobel Prize for his discovery of the photoelectric effect", "He won the Nobel Prize in 1968.", "There was a cat."]
-                    Example Input: "Who won the Nobel Prize in 1968 and for what?"
-                    Example Expected Output: "Einstein won the Nobel Prize in 1968 for his discovery of the photoelectric effect."
-
-                    Example:
-                    {{
-                        "verdicts": [
-                            {{
-                                "reason": "It clearly addresses the question by stating that 'Einstein won the Nobel Prize for his discovery of the photoelectric effect.'",
-                                "verdict": "yes"
-                            }},
-                            {{
-                                "reason": "The text verifies that the prize was indeed won in 1968.",
-                                "verdict": "yes"
-                            }},
-                            {{
-                                "reason": "'There was a cat' is not at all relevant to the topic of winning a Nobel Prize.",
-                                "verdict": "no"
-                            }}
-                        ]
-                    }}
-                    Since you are going to generate a verdict for each context, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to that of the contexts.
-                    **
-
-                    Input:
-                    """
-                )
-            ]
-            + input
-            + [
-                textwrap.dedent(
-                    """
-                    Expected output:
-                    """
-                )
-            ]
-            + expected_output
-            + [
-                textwrap.dedent(
-                    f"""
-                    Retrieval Context{document_count_str}:
-                    """
-                )
-            ]
-            + MultiModalContextualPrecisionTemplate.id_retrieval_context(
-                retrieval_context
-            )
-            + [
-                textwrap.dedent(
-                    """
-                    JSON:
-                    """
-                )
-            ]
-        )
-
-    @staticmethod
-    def generate_reason(input, verdicts, score) -> List[Union[str, MLLMImage]]:
-        # given the input and retrieval context for this input, where the verdict is whether ... and the node is the ..., give a reason for the score
-        return (
-            [
-                textwrap.dedent(
-                    f"""Given the input, retrieval contexts, and contextual precision score, provide a CONCISE summarize for the score. Explain why it is not higher, but also why it is at its current score.
-                    The retrieval contexts is a list of JSON with three keys: `verdict`, `reason` (reason for the verdict) and `node`. `verdict` will be either 'yes' or 'no', which represents whether the corresponding 'node' in the retrieval context is relevant to the input.
-                    Contextual precision represents if the relevant nodes are ranked higher than irrelevant nodes. Also note that retrieval contexts is given IN THE ORDER OF THEIR RANKINGS.
-
-                    **
-                    IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-                    Example JSON:
-                    {{
-                        "reason": "The score is <contextual_precision_score> because <your_reason>."
-                    }}
-
-
-                    DO NOT mention 'verdict' in your reason, but instead phrase it as irrelevant nodes. The term 'verdict' are just here for you to understand the broader scope of things.
-                    Also DO NOT mention there are `reason` fields in the retrieval contexts you are presented with, instead just use the information in the `reason` field.
-                    In your reason, you MUST USE the `reason`, QUOTES in the 'reason', and the node RANK (starting from 1, eg. first node) to explain why the 'no' verdicts should be ranked lower than the 'yes' verdicts.
-                    When addressing nodes, make it explicit that it is nodes in retrieval context.
-                    If the score is 1, keep it short and say something positive with an upbeat tone (but don't overdo it otherwise it gets annoying).
-                    **
-
-                    Contextual Precision Score:
-                    {score}
-
-                    Input:
-                    """
-                )
-            ]
-            + input
-            + [
-                textwrap.dedent(
-                    f"""
-                    Retrieval Contexts:
-                    {verdicts}
-
-                    JSON:
-                    """
-                )
-            ]
-        )
-
-    @staticmethod
-    def id_retrieval_context(retrieval_context) -> List[Union[str, MLLMImage]]:
-        annotated_retrieval_context = []
-        for i, context in enumerate(retrieval_context):
-            if isinstance(context, str):
-                annotated_retrieval_context.append(f"Node {i + 1}: {context}")
-            elif isinstance(context, MLLMImage):
-                annotated_retrieval_context.append(f"Node {i + 1}:")
-                annotated_retrieval_context.append(context)
-        return annotated_retrieval_context