deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_faithfulness/turn_faithfulness.py (new file)

@@ -0,0 +1,596 @@
from typing import List, Optional, Union, Type, Tuple
import asyncio

from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
from deepeval.metrics import BaseConversationalMetric
from deepeval.utils import (
    get_or_create_event_loop,
    prettify_list,
)
from deepeval.metrics.utils import (
    construct_verbose_logs,
    trimAndLoadJson,
    check_conversational_test_case_params,
    get_unit_interactions,
    initialize_model,
)
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.turn_faithfulness.template import (
    TurnFaithfulnessTemplate,
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.metrics.turn_faithfulness.schema import (
    FaithfulnessVerdict,
    Verdicts,
    FaithfulnessScoreReason,
    Truths,
    Claims,
    InteractionFaithfulnessScore,
)
from deepeval.metrics.api import metric_data_manager


class TurnFaithfulnessMetric(BaseConversationalMetric):
    _required_test_case_params: List[TurnParams] = [
        TurnParams.CONTENT,
        TurnParams.RETRIEVAL_CONTEXT,
    ]

    def __init__(
        self,
        threshold: float = 0.5,
        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
        include_reason: bool = True,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
        truths_extraction_limit: Optional[int] = None,
        penalize_ambiguous_claims: bool = False,
        evaluation_template: Type[
            TurnFaithfulnessTemplate
        ] = TurnFaithfulnessTemplate,
    ):
        self.threshold = 1 if strict_mode else threshold
        self.model, self.using_native_model = initialize_model(model)
        self.evaluation_model = self.model.get_model_name()
        self.include_reason = include_reason
        self.async_mode = async_mode
        self.strict_mode = strict_mode
        self.verbose_mode = verbose_mode
        self.evaluation_template = evaluation_template
        self.penalize_ambiguous_claims = penalize_ambiguous_claims

        self.truths_extraction_limit = truths_extraction_limit
        if self.truths_extraction_limit is not None:
            self.truths_extraction_limit = max(self.truths_extraction_limit, 0)

    def measure(
        self,
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ):
        check_conversational_test_case_params(
            test_case,
            self._required_test_case_params,
            self,
            False,
            self.model,
            test_case.multimodal,
        )

        multimodal = test_case.multimodal

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self, _show_indicator=_show_indicator, _in_component=_in_component
        ):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_measure(
                        test_case,
                        _show_indicator=False,
                        _in_component=_in_component,
                        _log_metric_to_confident=_log_metric_to_confident,
                    )
                )
            else:
                unit_interactions = get_unit_interactions(test_case.turns)
                scores = self._get_faithfulness_scores(
                    unit_interactions, multimodal
                )
                self.score = self._calculate_score(scores)
                self.success = self.score >= self.threshold
                self.reason = self._generate_reason(scores)
                verbose_steps = self._get_verbose_steps(scores)
                self.verbose_logs = construct_verbose_logs(
                    self,
                    steps=[
                        *verbose_steps,
                        f"Final Score: {self.score}\n",
                        f"Final Reason: {self.reason}\n",
                    ],
                )
                if _log_metric_to_confident:
                    metric_data_manager.post_metric_if_enabled(
                        self, test_case=test_case
                    )

            return self.score

    async def a_measure(
        self,
        test_case: ConversationalTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        check_conversational_test_case_params(
            test_case,
            self._required_test_case_params,
            self,
            False,
            self.model,
            test_case.multimodal,
        )

        multimodal = test_case.multimodal

        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self,
            async_mode=True,
            _show_indicator=_show_indicator,
            _in_component=_in_component,
        ):
            unit_interactions = get_unit_interactions(test_case.turns)
            scores = await self._a_get_faithfulness_scores(
                unit_interactions, multimodal
            )
            self.score = self._calculate_score(scores)
            self.success = self.score >= self.threshold
            self.reason = await self._a_generate_reason(scores)
            verbose_steps = self._get_verbose_steps(scores)
            self.verbose_logs = construct_verbose_logs(
                self,
                steps=[
                    *verbose_steps,
                    f"Final Score: {self.score}\n",
                    f"Final Reason: {self.reason}\n",
                ],
            )
            if _log_metric_to_confident:
                metric_data_manager.post_metric_if_enabled(
                    self, test_case=test_case
                )

            return self.score

    async def _a_get_faithfulness_scores(
        self, unit_interactions: List[List[Turn]], multimodal: bool
    ):

        async def get_interaction_score(unit_interaction: List[Turn]):
            user_content = "User Message: "
            retrieval_context = []
            assistant_content = "Assistant Message: "
            for turn in unit_interaction:
                if turn.role == "user":
                    user_content += f"\n{turn.content} "
                else:
                    assistant_content += f"\n{turn.content} "
                    retrieval_context.extend(turn.retrieval_context)
            truths = await self._a_generate_truths(
                retrieval_context, multimodal
            )
            claims = await self._a_generate_claims(
                user_content, assistant_content, multimodal
            )
            verdicts = await self._a_generate_verdicts(
                claims, truths, multimodal
            )
            score, reason = self._get_interaction_score_and_reason(
                verdicts, multimodal
            )
            interaction_score = InteractionFaithfulnessScore(
                score=score,
                reason=reason,
                claims=claims,
                truths=truths,
                verdicts=verdicts,
            )
            return interaction_score

        final_scores = await asyncio.gather(
            *[
                get_interaction_score(unit_interaction)
                for unit_interaction in unit_interactions
            ]
        )

        return final_scores

    def _get_faithfulness_scores(
        self, unit_interactions: List[List[Turn]], multimodal: bool
    ):
        interaction_scores = []

        for unit_interaction in unit_interactions:
            user_content = "User Message: "
            retrieval_context = []
            assistant_content = "Assistant Message: "
            for turn in unit_interaction:
                if turn.role == "user":
                    user_content += f"\n{turn.content} "
                else:
                    assistant_content += f"\n{turn.content} "
                    retrieval_context.extend(turn.retrieval_context)
            truths = self._generate_truths(retrieval_context, multimodal)
            claims = self._generate_claims(
                user_content, assistant_content, multimodal
            )
            verdicts = self._generate_verdicts(claims, truths, multimodal)
            score, reason = self._get_interaction_score_and_reason(
                verdicts, multimodal
            )
            interaction_score = InteractionFaithfulnessScore(
                score=score,
                reason=reason,
                claims=claims,
                truths=truths,
                verdicts=verdicts,
            )
            interaction_scores.append(interaction_score)

        return interaction_scores

    async def _a_generate_truths(
        self, retrieval_context: str, multimodal: bool
    ) -> List[str]:
        prompt = self.evaluation_template.generate_truths(
            reference_context="\n\n".join(retrieval_context),
            extraction_limit=self.truths_extraction_limit,
            multimodal=multimodal,
        )
        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt, schema=Truths)
            self.evaluation_cost += cost
            return res.truths
        else:
            try:
                res: Truths = await self.model.a_generate(prompt, schema=Truths)
                return res.truths
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["truths"]

    def _generate_truths(
        self, retrieval_context: str, multimodal: bool
    ) -> List[str]:
        prompt = self.evaluation_template.generate_truths(
            reference_context="\n\n".join(retrieval_context),
            extraction_limit=self.truths_extraction_limit,
            multimodal=multimodal,
        )
        if self.using_native_model:
            res, cost = self.model.generate(prompt, schema=Truths)
            self.evaluation_cost += cost
            return res.truths
        else:
            try:
                res: Truths = self.model.generate(prompt, schema=Truths)
                return res.truths
            except TypeError:
                res = self.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["truths"]

    async def _a_generate_claims(
        self, user_content: str, assistant_content: str, multimodal: bool
    ) -> List[str]:
        prompt = self.evaluation_template.generate_claims(
            input=user_content,
            assistant_output=assistant_content,
            multimodal=multimodal,
        )
        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt, schema=Claims)
            self.evaluation_cost += cost
            return res.claims
        else:
            try:
                res: Claims = await self.model.a_generate(prompt, schema=Claims)
                return res.claims
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["claims"]

    def _generate_claims(
        self, user_content: str, assistant_content: str, multimodal: bool
    ) -> List[str]:
        prompt = self.evaluation_template.generate_claims(
            input=user_content,
            assistant_output=assistant_content,
            multimodal=multimodal,
        )
        if self.using_native_model:
            res, cost = self.model.generate(prompt, schema=Claims)
            self.evaluation_cost += cost
            return res.claims
        else:
            try:
                res: Claims = self.model.generate(prompt, schema=Claims)
                return res.claims
            except TypeError:
                res = self.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["claims"]

    async def _a_generate_verdicts(
        self, claims: Claims, truths: Truths, multimodal: bool
    ) -> List[FaithfulnessVerdict]:
        if len(claims) == 0:
            return []

        verdicts: List[FaithfulnessVerdict] = []

        prompt = self.evaluation_template.generate_verdicts(
            claims=claims,
            reference_context="\n\n".join(truths),
            multimodal=multimodal,
        )

        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
            self.evaluation_cost += cost
            verdicts = [item for item in res.verdicts]
            return verdicts
        else:
            try:
                res: Verdicts = await self.model.a_generate(
                    prompt, schema=Verdicts
                )
                verdicts = [item for item in res.verdicts]
                return verdicts
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                verdicts = [
                    FaithfulnessVerdict(**item) for item in data["verdicts"]
                ]
                return verdicts

    def _generate_verdicts(
        self, claims: Claims, truths: Truths, multimodal: bool
    ) -> List[FaithfulnessVerdict]:
        if len(claims) == 0:
            return []

        verdicts: List[FaithfulnessVerdict] = []

        prompt = self.evaluation_template.generate_verdicts(
            claims=claims,
            reference_context="\n\n".join(truths),
            multimodal=multimodal,
        )

        if self.using_native_model:
            res, cost = self.model.generate(prompt, schema=Verdicts)
            self.evaluation_cost += cost
            verdicts = [item for item in res.verdicts]
            return verdicts
        else:
            try:
                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
                verdicts = [item for item in res.verdicts]
                return verdicts
            except TypeError:
                res = self.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                verdicts = [
                    FaithfulnessVerdict(**item) for item in data["verdicts"]
                ]
                return verdicts

    def _get_interaction_score_and_reason(
        self, verdicts, multimodal: bool
    ) -> Tuple[float, str]:
        number_of_verdicts = len(verdicts)
        if number_of_verdicts == 0:
            return 1

        faithfulness_count = 0
        for verdict in verdicts:
            if verdict.verdict.strip().lower() != "no":
                faithfulness_count += 1

            if (
                self.penalize_ambiguous_claims
                and verdict.verdict.strip().lower() == "idk"
            ):
                faithfulness_count -= 1

        score = faithfulness_count / number_of_verdicts
        reason = self._get_interaction_reason(score, verdicts, multimodal)
        return (
            (0, reason)
            if self.strict_mode and score < self.threshold
            else (score, reason)
        )

    async def _a_get_interaction_score_and_reason(
        self, verdicts, multimodal: bool
    ) -> Tuple[float, str]:
        number_of_verdicts = len(verdicts)
        if number_of_verdicts == 0:
            return 1

        faithfulness_count = 0
        for verdict in verdicts:
            if verdict.verdict.strip().lower() != "no":
                faithfulness_count += 1

            if (
                self.penalize_ambiguous_claims
                and verdict.verdict.strip().lower() == "idk"
            ):
                faithfulness_count -= 1

        score = faithfulness_count / number_of_verdicts
        reason = await self._a_get_interaction_reason(
            score, verdicts, multimodal
        )
        return (
            (0, reason)
            if self.strict_mode and score < self.threshold
            else (score, reason)
        )

    async def _a_get_interaction_reason(
        self, score, verdicts, multimodal: bool
    ) -> str:
        if self.include_reason is False:
            return None

        contradictions = []
        for verdict in verdicts:
            if verdict.verdict.strip().lower() == "no":
                contradictions.append(verdict.reason)

        prompt = self.evaluation_template.generate_reason(
            contradictions=contradictions,
            score=format(score, ".2f"),
            multimodal=multimodal,
        )

        if self.using_native_model:
            res, cost = await self.model.a_generate(
                prompt, schema=FaithfulnessScoreReason
            )
            self.evaluation_cost += cost
            return res.reason
        else:
            try:
                res: FaithfulnessScoreReason = await self.model.a_generate(
                    prompt, schema=FaithfulnessScoreReason
                )
                return res.reason
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["reason"]

    def _get_interaction_reason(self, score, verdicts, multimodal: bool) -> str:
        if self.include_reason is False:
            return None

        contradictions = []
        for verdict in verdicts:
            if verdict.verdict.strip().lower() == "no":
                contradictions.append(verdict.reason)

        prompt = self.evaluation_template.generate_reason(
            contradictions=contradictions,
            score=format(score, ".2f"),
            multimodal=multimodal,
        )

        if self.using_native_model:
            res, cost = self.model.generate(
                prompt, schema=FaithfulnessScoreReason
            )
            self.evaluation_cost += cost
            return res.reason
        else:
            try:
                res: FaithfulnessScoreReason = self.model.generate(
                    prompt, schema=FaithfulnessScoreReason
                )
                return res.reason
            except TypeError:
                res = self.model.generate(prompt)
                data = trimAndLoadJson(res, self)
                return data["reason"]

    def _get_verbose_steps(
        self, interaction_scores: List[InteractionFaithfulnessScore]
    ):
        steps = []
        for index, interaction_score in enumerate(interaction_scores):
            interaction_steps = [
                f"Interaction {index + 1} \n",
                f"Truths: {prettify_list(interaction_score.truths)} \n",
                f"Claims: {prettify_list(interaction_score.claims)} \n",
                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                f"Score: {interaction_score.score} \n",
                f"Reason: {interaction_score.reason} \n",
            ]
            steps.extend(interaction_steps)
        return steps

    def _generate_reason(
        self, scores: List[InteractionFaithfulnessScore]
    ) -> str:
        reasons = []
        for score in scores:
            reasons.append(score.reason)

        prompt = self.evaluation_template.generate_final_reason(
            self.score, self.success, reasons
        )

        if self.using_native_model:
            res, cost = self.model.generate(prompt)
            self.evaluation_cost += cost
            return res
        else:
            res = self.model.generate(prompt)
            return res

    async def _a_generate_reason(
        self, scores: List[InteractionFaithfulnessScore]
    ) -> str:
        reasons = []
        for score in scores:
            reasons.append(score.reason)

        prompt = self.evaluation_template.generate_final_reason(
            self.score, self.success, reasons
        )

        if self.using_native_model:
            res, cost = await self.model.a_generate(prompt)
            self.evaluation_cost += cost
            return res
        else:
            res = await self.model.a_generate(prompt)
            return res

    def _calculate_score(
        self, scores: List[InteractionFaithfulnessScore]
    ) -> float:
        number_of_scores = len(scores)
        if number_of_scores == 0:
            return 1
        total_score = 0
        for score in scores:
            total_score += score.score
        return total_score / number_of_scores

    def is_successful(self) -> bool:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except:
                self.success = False
        return self.success

    @property
    def __name__(self):
        return "Turn Faithfulness"
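For orientation, here is a minimal usage sketch of the new TurnFaithfulnessMetric shown in the diff above. It is based only on the constructor and measure() signature in this file; the sample conversation, the retrieval context, and the assumption that a default evaluation LLM is configured (e.g. via an OpenAI API key) are illustrative, not part of the release.

# Illustrative sketch only: sample data and model configuration are assumptions.
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What is the refund window?"),
        Turn(
            role="assistant",
            content="You can request a refund within 30 days of purchase.",
            # The assistant turn carries the retrieval context its claims are checked against.
            retrieval_context=["Refunds are accepted within 30 days of purchase."],
        ),
    ]
)

metric = TurnFaithfulnessMetric(threshold=0.5, include_reason=True)
score = metric.measure(test_case)  # averages per-interaction faithfulness scores
print(score, metric.reason)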