deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_faithfulness/turn_faithfulness.py

@@ -0,0 +1,627 @@
+from typing import List, Optional, Union, Type, Tuple
+import asyncio
+import itertools
+from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.metrics import BaseConversationalMetric
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    trimAndLoadJson,
+    check_conversational_test_case_params,
+    get_unit_interactions,
+    get_turns_in_sliding_window,
+    initialize_model,
+)
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.metrics.turn_faithfulness.template import (
+    TurnFaithfulnessTemplate,
+)
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.turn_faithfulness.schema import (
+    FaithfulnessVerdict,
+    Verdicts,
+    FaithfulnessScoreReason,
+    Truths,
+    Claims,
+    InteractionFaithfulnessScore,
+)
+from deepeval.metrics.api import metric_data_manager
+
+
+class TurnFaithfulnessMetric(BaseConversationalMetric):
+    _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
+        TurnParams.CONTENT,
+        TurnParams.RETRIEVAL_CONTEXT,
+    ]
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        truths_extraction_limit: Optional[int] = None,
+        penalize_ambiguous_claims: bool = False,
+        window_size: int = 10,
+        evaluation_template: Type[
+            TurnFaithfulnessTemplate
+        ] = TurnFaithfulnessTemplate,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.evaluation_template = evaluation_template
+        self.penalize_ambiguous_claims = penalize_ambiguous_claims
+        self.window_size = window_size
+
+        self.truths_extraction_limit = truths_extraction_limit
+        if self.truths_extraction_limit is not None:
+            self.truths_extraction_limit = max(self.truths_extraction_limit, 0)
+
+    def measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ):
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                unit_interactions = get_unit_interactions(test_case.turns)
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_faithfulness_scores(window, multimodal)
+                    )
+                self.score = self._calculate_score(scores)
+                self.success = self.score >= self.threshold
+                self.reason = self._generate_reason(scores)
+                verbose_steps = self._get_verbose_steps(scores)
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        *verbose_steps,
+                        f"Final Score: {self.score}\n",
+                        f"Final Reason: {self.reason}\n",
+                    ],
+                )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: ConversationalTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_conversational_test_case_params(
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
+        )
+
+        multimodal = test_case.multimodal
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        ):
+            unit_interactions = get_unit_interactions(test_case.turns)
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_faithfulness_scores(window, multimodal)
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
+            self.score = self._calculate_score(scores)
+            self.success = self.score >= self.threshold
+            self.reason = await self._a_generate_reason(scores)
+            verbose_steps = self._get_verbose_steps(scores)
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    *verbose_steps,
+                    f"Final Score: {self.score}\n",
+                    f"Final Reason: {self.reason}\n",
+                ],
+            )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def _a_get_faithfulness_scores(
+        self, turns_window: List[Turn], multimodal: bool
+    ):
+
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        truths = await self._a_generate_truths(retrieval_context, multimodal)
+        claims = await self._a_generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = await self._a_generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
+        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    def _get_faithfulness_scores(
+        self, turns_window: List[Turn], multimodal: bool
+    ):
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
+
+        truths = self._generate_truths(retrieval_context, multimodal)
+        claims = self._generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = self._generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
+        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
+
+    async def _a_generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_truths(
+            reference_context="\n\n".join(retrieval_context),
+            extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Truths)
+            self.evaluation_cost += cost
+            return res.truths
+        else:
+            try:
+                res: Truths = await self.model.a_generate(prompt, schema=Truths)
+                return res.truths
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["truths"]
+
+    def _generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_truths(
+            reference_context="\n\n".join(retrieval_context),
+            extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Truths)
+            self.evaluation_cost += cost
+            return res.truths
+        else:
+            try:
+                res: Truths = self.model.generate(prompt, schema=Truths)
+                return res.truths
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["truths"]
+
+    async def _a_generate_claims(
+        self, user_content: str, assistant_content: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_claims(
+            input=user_content,
+            assistant_output=assistant_content,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Claims)
+            self.evaluation_cost += cost
+            return res.claims
+        else:
+            try:
+                res: Claims = await self.model.a_generate(prompt, schema=Claims)
+                return res.claims
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["claims"]
+
+    def _generate_claims(
+        self, user_content: str, assistant_content: str, multimodal: bool
+    ) -> List[str]:
+        prompt = self.evaluation_template.generate_claims(
+            input=user_content,
+            assistant_output=assistant_content,
+            multimodal=multimodal,
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Claims)
+            self.evaluation_cost += cost
+            return res.claims
+        else:
+            try:
+                res: Claims = self.model.generate(prompt, schema=Claims)
+                return res.claims
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["claims"]
+
+    async def _a_generate_verdicts(
+        self, claims: Claims, truths: Truths, multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
+        if len(claims) == 0:
+            return []
+
+        verdicts: List[FaithfulnessVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            claims=claims,
+            reference_context="\n\n".join(truths),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    FaithfulnessVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self, claims: Claims, truths: Truths, multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
+        if len(claims) == 0:
+            return []
+
+        verdicts: List[FaithfulnessVerdict] = []
+
+        prompt = self.evaluation_template.generate_verdicts(
+            claims=claims,
+            reference_context="\n\n".join(truths),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=Verdicts)
+            self.evaluation_cost += cost
+            verdicts = [item for item in res.verdicts]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                verdicts = [
+                    FaithfulnessVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _get_interaction_score_and_reason(
+        self, verdicts, multimodal: bool
+    ) -> Tuple[float, str]:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 1
+
+        faithfulness_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() != "no":
+                faithfulness_count += 1
+
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
+        score = faithfulness_count / number_of_verdicts
+        reason = self._get_interaction_reason(score, verdicts, multimodal)
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    async def _a_get_interaction_score_and_reason(
+        self, verdicts, multimodal: bool
+    ) -> Tuple[float, str]:
+        number_of_verdicts = len(verdicts)
+        if number_of_verdicts == 0:
+            return 1
+
+        faithfulness_count = 0
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() != "no":
+                faithfulness_count += 1
+
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
+        score = faithfulness_count / number_of_verdicts
+        reason = await self._a_get_interaction_reason(
+            score, verdicts, multimodal
+        )
+        return (
+            (0, reason)
+            if self.strict_mode and score < self.threshold
+            else (score, reason)
+        )
+
+    async def _a_get_interaction_reason(
+        self, score, verdicts, multimodal: bool
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        contradictions = []
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "no":
+                contradictions.append(verdict.reason)
+
+        prompt = self.evaluation_template.generate_reason(
+            contradictions=contradictions,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=FaithfulnessScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: FaithfulnessScoreReason = await self.model.a_generate(
+                    prompt, schema=FaithfulnessScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_interaction_reason(self, score, verdicts, multimodal: bool) -> str:
+        if self.include_reason is False:
+            return None
+
+        contradictions = []
+        for verdict in verdicts:
+            if verdict.verdict.strip().lower() == "no":
+                contradictions.append(verdict.reason)
+
+        prompt = self.evaluation_template.generate_reason(
+            contradictions=contradictions,
+            score=format(score, ".2f"),
+            multimodal=multimodal,
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(
+                prompt, schema=FaithfulnessScoreReason
+            )
+            self.evaluation_cost += cost
+            return res.reason
+        else:
+            try:
+                res: FaithfulnessScoreReason = self.model.generate(
+                    prompt, schema=FaithfulnessScoreReason
+                )
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return data["reason"]
+
+    def _get_verbose_steps(
+        self, interaction_scores: List[InteractionFaithfulnessScore]
+    ):
+        steps = []
+        for index, interaction_score in enumerate(interaction_scores):
+            interaction_steps = [
+                f"Window {index + 1} \n",
+                f"Truths: {prettify_list(interaction_score.truths)} \n",
+                f"Claims: {prettify_list(interaction_score.claims)} \n",
+                f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
+                f"Score: {interaction_score.score} \n",
+                f"Reason: {interaction_score.reason} \n",
+            ]
+            steps.extend(interaction_steps)
+        return steps
+
+    def _generate_reason(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
+        reasons = []
+        for score in scores:
+            reasons.append(score.reason)
+
+        prompt = self.evaluation_template.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
+
+    def _calculate_score(
+        self, scores: List[InteractionFaithfulnessScore]
+    ) -> float:
+        number_of_scores = len(scores)
+        if number_of_scores == 0:
+            return 1
+        total_score = 0
+        for score in scores:
+            total_score += score.score
+        return total_score / number_of_scores
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Turn Faithfulness"
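The new TurnFaithfulnessMetric above applies the familiar truths → claims → verdicts flow per sliding window of turns and averages the per-window scores. A minimal usage sketch follows; the conversation content is invented, it assumes Turn and ConversationalTestCase accept the keyword fields the metric reads (role, content, retrieval_context), and it imports from the module path shown in the file listing (a top-level re-export from deepeval.metrics may also exist):

from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.turn_faithfulness.turn_faithfulness import (
    TurnFaithfulnessMetric,
)

# Hypothetical two-turn conversation; retrieval context sits on the assistant turn.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="When was the store founded?"),
        Turn(
            role="assistant",
            content="It was founded in 1994.",
            retrieval_context=["The store opened its doors in 1994."],
        ),
    ]
)

metric = TurnFaithfulnessMetric(threshold=0.5, window_size=10, include_reason=True)
metric.measure(test_case)  # async_mode=True by default, so this drives a_measure via an event loop
print(metric.score, metric.reason)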
deepeval/metrics/turn_relevancy/template.py

@@ -2,9 +2,20 @@ from typing import List, Dict
 
 
 class TurnRelevancyTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_verdicts(sliding_window: List[Dict]):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether the LAST `assistant` message is relevant to context in messages. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the last `assistant` message is relevant according to the context in messages
 Provide a 'reason' ONLY if the answer is 'no'.
 You MUST USE the previous messages (if any) provided in the list of messages to make an informed judgement on relevancy.
@@ -52,6 +63,9 @@ JSON:
     @staticmethod
     def generate_reason(score, irrelevancies):
         return f"""Below is a list of irrelevancies drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why the 'assistant' messages are irrelevant to the 'user' messages.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 Given the relevancy score, which is a 0-1 score indicating how irrelevant the OVERALL AI messages are in a conversation (higher the better), CONCISELY summarize the irrelevancies to justify the score.
 
 **
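For reference, the injected multimodal_rules block now appears in every prompt TurnRelevancyTemplate renders. A quick sketch of rendering a verdict prompt (the message dictionaries are hypothetical; the diff only shows that generate_verdicts takes a List[Dict]):

from deepeval.metrics.turn_relevancy.template import TurnRelevancyTemplate

# Hypothetical sliding window of message dicts.
window = [
    {"role": "user", "content": "What does the attached chart show?"},
    {"role": "assistant", "content": "Quarterly revenue rising through Q4."},
]
prompt = TurnRelevancyTemplate.generate_verdicts(sliding_window=window)
print(prompt)  # the prompt body now contains the MULTIMODAL INPUT RULES section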