deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_faithfulness/turn_faithfulness.py (removed lines whose content was not preserved in the source diff view are shown as `…`):

```diff
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,7 +12,10 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_faithfulness.template import (
@@ -32,6 +35,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnFaithfulnessMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
     ]
@@ -46,6 +50,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
         penalize_ambiguous_claims: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnFaithfulnessTemplate
         ] = TurnFaithfulnessTemplate,
@@ -59,6 +64,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
         self.penalize_ambiguous_claims = penalize_ambiguous_claims
+        self.window_size = window_size
 
         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -98,9 +104,17 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             )
         else:
             unit_interactions = get_unit_interactions(test_case.turns)
-            …
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            for window in turns_windows:
+                scores.extend(
+                    self._get_faithfulness_scores(window, multimodal)
+                )
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = self._generate_reason(scores)
```
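The windowing added above replaces the old per-conversation scoring: unit interactions are grouped into sliding windows, and each window is flattened back into a plain list of turns with `itertools.chain`. A minimal sketch of the semantics `get_turns_in_sliding_window` appears to have, judging only from how this diff consumes it (the real helper lives in `deepeval/metrics/utils.py` and is not shown here):

```python
import itertools
from typing import Iterator, List

# Hypothetical stand-ins for deepeval's types: a "unit interaction" is one
# user/assistant exchange, here reduced to a list of turn labels.
Turn = str
UnitInteraction = List[Turn]

def get_turns_in_sliding_window(
    unit_interactions: List[UnitInteraction], window_size: int
) -> Iterator[List[UnitInteraction]]:
    # Assumed behavior: one window per unit interaction, carrying up to
    # (window_size - 1) preceding interactions as context.
    for i in range(len(unit_interactions)):
        yield unit_interactions[max(0, i - window_size + 1) : i + 1]

unit_interactions = [["u1", "a1"], ["u2", "a2"], ["u3", "a3"]]
turns_windows = [
    list(itertools.chain(*window))
    for window in get_turns_in_sliding_window(unit_interactions, 2)
]
print(turns_windows)
# [['u1', 'a1'], ['u1', 'a1', 'u2', 'a2'], ['u2', 'a2', 'u3', 'a3']]
```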
```diff
@@ -146,9 +160,23 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            …
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_faithfulness_scores(window, multimodal)
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
```
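Note that the async path fans each window out as its own coroutine: the inner closure `extend`s a shared list while `asyncio.gather` drives all windows concurrently, so `scores` fills in completion order rather than window order. That is harmless here because the final score and reason aggregate over all windows. The pattern in isolation, with a stand-in for the real scoring call:

```python
import asyncio
from typing import List

async def score_window(window_id: int) -> List[float]:
    # Stand-in for _a_get_faithfulness_scores: one score per window,
    # finishing at staggered times.
    await asyncio.sleep((5 - window_id) / 100)
    return [float(window_id)]

async def main() -> None:
    scores: List[float] = []
    tasks = []

    async def collect(window_id: int) -> None:
        # Closure appends into the shared list, as in the diff above.
        scores.extend(await score_window(window_id))

    for window_id in range(5):
        tasks.append(collect(window_id))
    await asyncio.gather(*tasks)

    print(scores)  # completion order: [4.0, 3.0, 2.0, 1.0, 0.0]

asyncio.run(main())
```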
```diff
@@ -169,82 +197,75 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         return self.score
 
     async def _a_get_faithfulness_scores(
-        self,
+        self, turns_window: List[Turn], multimodal: bool
     ):
 
-        …
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+            if turn.retrieval_context is not None:
                 retrieval_context.extend(turn.retrieval_context)
-        truths = await self._a_generate_truths(
-            retrieval_context, multimodal
-        )
-        claims = await self._a_generate_claims(
-            user_content, assistant_content, multimodal
-        )
-        verdicts = await self._a_generate_verdicts(
-            claims, truths, multimodal
-        )
-        score, reason = self._get_interaction_score_and_reason(
-            verdicts, multimodal
-        )
-        interaction_score = InteractionFaithfulnessScore(
-            score=score,
-            reason=reason,
-            claims=claims,
-            truths=truths,
-            verdicts=verdicts,
-        )
-        return interaction_score
 
-        …
+        truths = await self._a_generate_truths(retrieval_context, multimodal)
+        claims = await self._a_generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = await self._a_generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
         )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     def _get_faithfulness_scores(
-        self,
+        self, turns_window: List[Turn], multimodal: bool
     ):
-        …
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+            if turn.retrieval_context is not None:
                 retrieval_context.extend(turn.retrieval_context)
-        truths = self._generate_truths(retrieval_context, multimodal)
-        claims = self._generate_claims(
-            user_content, assistant_content, multimodal
-        )
-        verdicts = self._generate_verdicts(claims, truths, multimodal)
-        score, reason = self._get_interaction_score_and_reason(
-            verdicts, multimodal
-        )
-        interaction_score = InteractionFaithfulnessScore(
-            score=score,
-            reason=reason,
-            claims=claims,
-            truths=truths,
-            verdicts=verdicts,
-        )
-        interaction_scores.append(interaction_score)
 
-        …
+        truths = self._generate_truths(retrieval_context, multimodal)
+        claims = self._generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = self._generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
+        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
 
     async def _a_generate_truths(
         self, retrieval_context: str, multimodal: bool
```
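Both `_get_faithfulness_scores` variants now score a window as one pooled interaction: user and assistant turns are concatenated into two strings and retrieval contexts are merged, rather than each turn pair being scored separately. The collapse step in isolation, with a hypothetical minimal `Turn` standing in for deepeval's:

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Turn:  # minimal stand-in for deepeval.test_case.Turn
    role: str
    content: str
    retrieval_context: Optional[List[str]] = None

def collapse_window(turns_window: List[Turn]):
    # Pool the whole window into (user text, assistant text, contexts),
    # mirroring the loop in the diff above.
    user_content, assistant_content, retrieval_context = "", "", []
    for turn in turns_window:
        if turn.role == "user":
            user_content += f"\n{turn.content} "
        else:
            assistant_content += f"\n{turn.content}"
        if turn.retrieval_context is not None:
            retrieval_context.extend(turn.retrieval_context)
    return user_content, assistant_content, retrieval_context

window = [
    Turn("user", "Is the museum open on Mondays?"),
    Turn(
        "assistant",
        "Yes, 9am to 5pm.",
        retrieval_context=["Opening hours: Mon-Fri 9:00-17:00."],
    ),
]
print(collapse_window(window))
```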
```diff
@@ -254,18 +275,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        …
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["truths"]
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     def _generate_truths(
         self, retrieval_context: str, multimodal: bool
```
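Every removed `try`/`except TypeError` pair followed the same shape: attempt a schema-constrained generation, and for models whose `generate` does not accept a `schema` argument, fall back to raw generation plus `trimAndLoadJson`. The new helpers centralize that branching. A sketch of what `generate_with_schema_and_extract` plausibly does, reconstructed from its call sites in this diff rather than from `deepeval/metrics/utils.py` itself (the real helper may also handle things like native-model cost tracking):

```python
from typing import Any, Callable, Type, TypeVar

from deepeval.metrics.utils import trimAndLoadJson

T = TypeVar("T")

def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[T],
    extract_schema: Callable[[T], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    try:
        # Schema-constrained path: the model returns a parsed schema_cls.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Model doesn't support the schema kwarg; parse raw JSON output.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```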
```diff
@@ -275,18 +292,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        …
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["truths"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     async def _a_generate_claims(
         self, user_content: str, assistant_content: str, multimodal: bool
@@ -296,18 +309,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             assistant_output=assistant_content,
             multimodal=multimodal,
         )
-        …
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["claims"]
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def _generate_claims(
         self, user_content: str, assistant_content: str, multimodal: bool
@@ -317,18 +326,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             assistant_output=assistant_content,
             multimodal=multimodal,
         )
-        …
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["claims"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     async def _a_generate_verdicts(
         self, claims: Claims, truths: Truths, multimodal: bool
@@ -344,25 +349,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        …
-            res: Verdicts = await self.model.a_generate(
-                prompt, schema=Verdicts
-            )
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                FaithfulnessVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self, claims: Claims, truths: Truths, multimodal: bool
@@ -378,23 +371,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        …
-            res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                FaithfulnessVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _get_interaction_score_and_reason(
         self, verdicts, multimodal: bool
@@ -467,22 +450,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        …
-        try:
-            res: FaithfulnessScoreReason = await self.model.a_generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(self, score, verdicts, multimodal: bool) -> str:
         if self.include_reason is False:
@@ -499,22 +473,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        …
-        try:
-            res: FaithfulnessScoreReason = self.model.generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionFaithfulnessScore]
@@ -522,7 +487,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"…
+                f"Window {index + 1} \n",
                 f"Truths: {prettify_list(interaction_score.truths)} \n",
                 f"Claims: {prettify_list(interaction_score.claims)} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
@@ -535,6 +500,12 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -543,17 +514,23 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        …
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -562,13 +539,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        …
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionFaithfulnessScore]
```
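Taken together, the 3.7.7 changes turn `TurnFaithfulnessMetric` from a per-conversation evaluator into a sliding-window one, with `TurnParams.ROLE` now required and `window_size` (default 10) controlling how much history each window carries. A usage sketch based only on the constructor arguments visible in this diff, assuming `TurnFaithfulnessMetric` is exported from `deepeval.metrics` like the other metrics:

```python
from deepeval.metrics import TurnFaithfulnessMetric
from deepeval.test_case import ConversationalTestCase, Turn

# window_size is the new 3.7.7 knob: each faithfulness score is computed
# over a sliding window of unit interactions rather than the whole chat.
metric = TurnFaithfulnessMetric(window_size=10, verbose_mode=True)

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What does the refund policy say?"),
        Turn(
            role="assistant",
            content="Refunds are accepted within 30 days.",
            retrieval_context=["Refunds are accepted within 30 days of purchase."],
        ),
    ]
)

metric.measure(test_case)
print(metric.score, metric.reason)
```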
deepeval/metrics/turn_relevancy/template.py:

```diff
@@ -2,9 +2,20 @@ from typing import List, Dict
 
 
 class TurnRelevancyTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_verdicts(sliding_window: List[Dict]):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether the LAST `assistant` message is relevant to context in messages. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the last `assistant` message is relevant according to the context in messages
 Provide a 'reason' ONLY if the answer is 'no'.
 You MUST USE the previous messages (if any) provided in the list of messages to make an informed judgement on relevancy.
@@ -52,6 +63,9 @@ JSON:
     @staticmethod
     def generate_reason(score, irrelevancies):
         return f"""Below is a list of irrelevancies drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why the 'assistant' messages are irrelevant to the 'user' messages.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 Given the relevancy score, which is a 0-1 score indicating how irrelevant the OVERALL AI messages are in a conversation (higher the better), CONCISELY summarize the irrelevancies to justify the score.
 
 **
```
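Because both prompt builders are f-strings, the shared `multimodal_rules` block is interpolated when the prompt is rendered, so the same rules now precede every verdict and reason request. A quick check that the rules land in the rendered prompt:

```python
from deepeval.metrics.turn_relevancy.template import TurnRelevancyTemplate

prompt = TurnRelevancyTemplate.generate_verdicts(
    sliding_window=[
        {"role": "user", "content": "Describe the attached chart."},
        {"role": "assistant", "content": "Revenue rises each quarter."},
    ]
)
assert "MULTIMODAL INPUT RULES" in prompt
```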