deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py  +106 -65

@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,6 +12,7 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -30,6 +31,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnContextualRelevancyMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
     ]
@@ -42,6 +44,7 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualRelevancyTemplate
         ] = TurnContextualRelevancyTemplate,
@@ -53,6 +56,7 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template
 
     def measure(
@@ -89,9 +93,19 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_relevancy_scores(
+                            window, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -137,9 +151,25 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_relevancy_scores(
+                        window, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -160,69 +190,63 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         return self.score
 
     async def _a_get_contextual_relevancy_scores(
-        self,
+        self, turns_window: List[Turn], multimodal: bool
     ):
-
-        user_content = "User Message: "
-        retrieval_context = []
-        for turn in unit_interaction:
-            if turn.role == "user":
-                user_content += f"\n{turn.content} "
-            else:
-                retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []
 
-
-
-
-
-
-
-
-
-            score=score,
-            reason=reason,
-            verdicts=verdicts,
-        )
-        return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
 
-
-
-
-
-
+        verdicts = await self._a_generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
         )
 
-
+        windows_scores.append(interaction_score)
+
+        return windows_scores
 
     def _get_contextual_relevancy_scores(
-        self,
+        self, turns_window: List[Turn], multimodal: bool
     ):
-
-
-
-
-
-
-
-
-
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
 
-
-
-
-
-
-
-
-
-
-
-
-
-        interaction_scores.append(interaction_score)
+        verdicts = self._generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     async def _a_generate_verdicts(
         self, input: str, retrieval_context: List[str], multimodal: bool
@@ -313,7 +337,10 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -332,7 +359,10 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -377,7 +407,6 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             if verdict.verdict.strip().lower() == "yes":
                 relevant_statements.append(verdict.statement)
             else:
-                # Include the reason for irrelevance
                 irrelevant_statements.append(
                     f"{verdict.statement}: {verdict.reason}"
                 )
@@ -458,12 +487,12 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         return data["reason"]
 
     def _get_verbose_steps(
-        self,
+        self, windows_scores: List[InteractionContextualRelevancyScore]
     ):
         steps = []
-        for index, interaction_score in enumerate(
+        for index, interaction_score in enumerate(windows_scores):
             interaction_steps = [
-                f"
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -474,6 +503,12 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -493,6 +528,12 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
     async def _a_generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
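The user-facing change in `turn_contextual_relevancy.py` is the new `window_size` argument (default 10): unit interactions are grouped into sliding windows, each window is flattened into a single turn list, and every window is scored as one interaction. A minimal usage sketch, assuming the metric is importable from `deepeval.metrics` like the other turn-level metrics (the conversation data below is purely illustrative):

```python
# Sketch only: the conversation content is made up, and the metric's import
# path is assumed to mirror deepeval's other turn-level metric exports.
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics import TurnContextualRelevancyMetric

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What is your refund policy?"),
        Turn(
            role="assistant",
            content="You can get a refund within 30 days of purchase.",
            retrieval_context=["Refunds are issued within 30 days of purchase."],
        ),
    ]
)

# window_size is new in 3.7.6: turns are evaluated in sliding windows of up to
# this many unit interactions instead of one interaction at a time.
metric = TurnContextualRelevancyMetric(window_size=10)
metric.measure(test_case)
print(metric.score, metric.reason)
```

The same `window_size` parameter is added to `TurnFaithfulnessMetric` in the next file.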
deepeval/metrics/turn_faithfulness/turn_faithfulness.py  +104 -73

@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,6 +12,7 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -32,6 +33,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnFaithfulnessMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
     ]
@@ -46,6 +48,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
         penalize_ambiguous_claims: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnFaithfulnessTemplate
         ] = TurnFaithfulnessTemplate,
@@ -59,6 +62,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
         self.penalize_ambiguous_claims = penalize_ambiguous_claims
+        self.window_size = window_size
 
         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -98,9 +102,17 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_faithfulness_scores(window, multimodal)
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -146,9 +158,23 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_faithfulness_scores(window, multimodal)
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -169,82 +195,75 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         return self.score
 
     async def _a_get_faithfulness_scores(
-        self,
+        self, turns_window: List[Turn], multimodal: bool
     ):
 
-
-
-
-
-
-
-
-
-
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
-        truths = await self._a_generate_truths(
-            retrieval_context, multimodal
-        )
-        claims = await self._a_generate_claims(
-            user_content, assistant_content, multimodal
-        )
-        verdicts = await self._a_generate_verdicts(
-            claims, truths, multimodal
-        )
-        score, reason = self._get_interaction_score_and_reason(
-            verdicts, multimodal
-        )
-        interaction_score = InteractionFaithfulnessScore(
-            score=score,
-            reason=reason,
-            claims=claims,
-            truths=truths,
-            verdicts=verdicts,
-        )
-        return interaction_score
 
-
-
-
-
-
+        truths = await self._a_generate_truths(retrieval_context, multimodal)
+        claims = await self._a_generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = await self._a_generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     def _get_faithfulness_scores(
-        self,
+        self, turns_window: List[Turn], multimodal: bool
    ):
-
-
-
-
-
-
-
-
-
-
-
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
-        truths = self._generate_truths(retrieval_context, multimodal)
-        claims = self._generate_claims(
-            user_content, assistant_content, multimodal
-        )
-        verdicts = self._generate_verdicts(claims, truths, multimodal)
-        score, reason = self._get_interaction_score_and_reason(
-            verdicts, multimodal
-        )
-        interaction_score = InteractionFaithfulnessScore(
-            score=score,
-            reason=reason,
-            claims=claims,
-            truths=truths,
-            verdicts=verdicts,
-        )
-        interaction_scores.append(interaction_score)
 
-
+        truths = self._generate_truths(retrieval_context, multimodal)
+        claims = self._generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = self._generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
+        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
 
     async def _a_generate_truths(
         self, retrieval_context: str, multimodal: bool
@@ -522,7 +541,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"
+                f"Window {index + 1} \n",
                 f"Truths: {prettify_list(interaction_score.truths)} \n",
                 f"Claims: {prettify_list(interaction_score.claims)} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
@@ -535,6 +554,12 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -554,6 +579,12 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
     async def _a_generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
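Both metrics flatten each sliding window of unit interactions into a single list of turns with `list(itertools.chain(*window))` before generating verdicts. The implementation of `get_turns_in_sliding_window` (imported from `deepeval.metrics.utils`) is not part of this diff, so the sketch below assumes a simple overlapping window of `window_size` unit interactions purely to illustrate the flattening step:

```python
# Standalone sketch of the window-then-flatten pattern used by both metrics.
# The real helper, get_turns_in_sliding_window, lives in deepeval.metrics.utils
# and may window differently; the overlapping window below is an assumption.
import itertools
from typing import Iterator, List


def sliding_windows(
    unit_interactions: List[List[str]], window_size: int
) -> Iterator[List[List[str]]]:
    # Yield consecutive windows of at most `window_size` unit interactions.
    if len(unit_interactions) <= window_size:
        yield unit_interactions
        return
    for start in range(len(unit_interactions) - window_size + 1):
        yield unit_interactions[start : start + window_size]


unit_interactions = [
    ["user_1", "assistant_1"],
    ["user_2", "assistant_2"],
    ["user_3", "assistant_3"],
]
for window in sliding_windows(unit_interactions, window_size=2):
    flat_turns = list(itertools.chain(*window))  # same flattening as the metric code above
    print(flat_turns)
```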
deepeval/metrics/turn_relevancy/template.py  +14 -0

@@ -2,9 +2,20 @@ from typing import List, Dict
 
 
 class TurnRelevancyTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_verdicts(sliding_window: List[Dict]):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether the LAST `assistant` message is relevant to context in messages. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the last `assistant` message is relevant according to the context in messages
 Provide a 'reason' ONLY if the answer is 'no'.
 You MUST USE the previous messages (if any) provided in the list of messages to make an informed judgement on relevancy.
@@ -52,6 +63,9 @@ JSON:
     @staticmethod
     def generate_reason(score, irrelevancies):
         return f"""Below is a list of irrelevancies drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why the 'assistant' messages are irrelevant to the 'user' messages.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 Given the relevancy score, which is a 0-1 score indicating how irrelevant the OVERALL AI messages are in a conversation (higher the better), CONCISELY summarize the irrelevancies to justify the score.
 
 **