deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py

@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,7 +12,10 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_relevancy.template import (
@@ -30,6 +33,7 @@ from deepeval.metrics.api import metric_data_manager

 class TurnContextualRelevancyMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
     ]
@@ -42,6 +46,7 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualRelevancyTemplate
         ] = TurnContextualRelevancyTemplate,
@@ -53,6 +58,7 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template

     def measure(
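The new `window_size` argument is exposed directly on the metric's constructor. A minimal usage sketch, assuming `TurnContextualRelevancyMetric` is exported from `deepeval.metrics` like the other turn-level metrics and that a default evaluation model is already configured:

```python
# Illustrative only: assumes TurnContextualRelevancyMetric is importable from
# deepeval.metrics and that an evaluation LLM (e.g. an OpenAI key) is set up.
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics import TurnContextualRelevancyMetric

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(
            role="assistant",
            content="It shipped yesterday and should arrive Friday.",
            retrieval_context=["Order #123 shipped May 1, ETA May 3."],
        ),
    ]
)

# window_size (new in 3.7.7) controls how many unit interactions are grouped
# into each sliding window before contextual relevancy is judged.
metric = TurnContextualRelevancyMetric(threshold=0.5, window_size=5)
metric.measure(test_case)
print(metric.score, metric.reason)
```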
@@ -89,9 +95,19 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_relevancy_scores(
+                            window, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
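The sync path now groups unit interactions into windows and flattens each window with `itertools.chain` before scoring. `get_turns_in_sliding_window` is deepeval's own helper; the sketch below uses a hypothetical stand-in to illustrate the windowing-plus-flattening pattern, not the library's exact stepping behaviour:

```python
import itertools
from typing import Iterator, List


def sliding_windows(items: List[list], size: int) -> Iterator[List[list]]:
    # Hypothetical stand-in for deepeval's get_turns_in_sliding_window:
    # overlapping windows of up to `size` consecutive unit interactions.
    if len(items) <= size:
        yield items
        return
    for start in range(len(items) - size + 1):
        yield items[start : start + size]


# Each unit interaction is a short list of turns (a user message plus replies).
unit_interactions = [["u1", "a1"], ["u2", "a2"], ["u3", "a3"]]

# Flatten every window into one list of turns, mirroring
# list(itertools.chain(*window)) in the new measure() body.
turns_windows = [
    list(itertools.chain(*window))
    for window in sliding_windows(unit_interactions, 2)
]
print(turns_windows)
# [['u1', 'a1', 'u2', 'a2'], ['u2', 'a2', 'u3', 'a3']]
```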
@@ -137,9 +153,25 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_relevancy_scores(
+                        window, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
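The async path fans the same per-window scoring out with `asyncio.gather`, collecting results through a shared list just as the inner `get_individual_scores` coroutine does above. A self-contained sketch of that pattern, with a stubbed coroutine standing in for `_a_get_contextual_relevancy_scores`:

```python
import asyncio
from typing import List


async def score_window(window: List[str]) -> List[float]:
    # Stub for self._a_get_contextual_relevancy_scores(window, multimodal):
    # the real method calls the evaluation LLM and returns one
    # InteractionContextualRelevancyScore per window.
    await asyncio.sleep(0)  # pretend this is an I/O-bound LLM call
    return [1.0]


async def main() -> None:
    turns_windows = [["u1", "a1"], ["u2", "a2"]]
    scores: List[float] = []

    async def collect(window: List[str]) -> None:
        # Mirrors the inner coroutine added in 3.7.7: every task extends one
        # shared list, then all tasks are awaited together.
        scores.extend(await score_window(window))

    await asyncio.gather(*(collect(w) for w in turns_windows))
    print(scores)  # [1.0, 1.0]


asyncio.run(main())
```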
@@ -160,69 +192,63 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         return self.score

     async def _a_get_contextual_relevancy_scores(
-        self,
+        self, turns_window: List[Turn], multimodal: bool
     ):
-
-        user_content = "User Message: "
-        retrieval_context = []
-        for turn in unit_interaction:
-            if turn.role == "user":
-                user_content += f"\n{turn.content} "
-            else:
-                retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []

-
-
-
-
-
-
-
-
-            score=score,
-            reason=reason,
-            verdicts=verdicts,
-        )
-        return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)

-
-
-
-
-
+        verdicts = await self._a_generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
         )

-
+        windows_scores.append(interaction_score)
+
+        return windows_scores

     def _get_contextual_relevancy_scores(
-        self,
+        self, turns_window: List[Turn], multimodal: bool
     ):
-
-
-
-
-
-
-
-
-
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)

-
-
-
-
-
-
-
-
-
-
-
-
-        interaction_scores.append(interaction_score)
+        verdicts = self._generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)

-        return
+        return windows_scores

     async def _a_generate_verdicts(
         self, input: str, retrieval_context: List[str], multimodal: bool
@@ -240,29 +266,15 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-
-
-
-
-
-
-
-
-
-                await self.model.a_generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-            )
-            verdicts.extend([item for item in res.verdicts])
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts.extend(
-                [
-                    ContextualRelevancyVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-            )
+        result = await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyVerdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
+
+        verdicts.extend(result)

         return verdicts

@@ -282,27 +294,15 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-
-
-
-
-
-
-
-
-
-                prompt, schema=ContextualRelevancyVerdicts
-            )
-            verdicts.extend([item for item in res.verdicts])
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts.extend(
-                [
-                    ContextualRelevancyVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-            )
+        result = generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyVerdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
+
+        verdicts.extend(result)

         return verdicts

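Both verdict generators (and, further down, the reason generators) now delegate to `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` from `deepeval.metrics.utils` instead of repeating the try/except `TypeError` fallback. The removed code hints at what the helper consolidates; a rough sketch under that assumption (the real body, signature, and error handling are not shown in this diff and may differ):

```python
from typing import Any, Callable, Type

from pydantic import BaseModel


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    # Sketch only: the real helper lives in deepeval.metrics.utils.
    try:
        # Models with native structured output return a schema instance.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback for models without schema support: parse the raw JSON
        # text, much like the removed trimAndLoadJson branch did.
        import json

        raw = metric.model.generate(prompt)
        return extract_json(json.loads(raw))
```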
@@ -313,7 +313,10 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -332,7 +335,10 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -377,7 +383,6 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             if verdict.verdict.strip().lower() == "yes":
                 relevant_statements.append(verdict.statement)
             else:
-                # Include the reason for irrelevance
                 irrelevant_statements.append(
                     f"{verdict.statement}: {verdict.reason}"
                 )
@@ -390,24 +395,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-
-
-
-
-
-
-
-        try:
-            res: ContextualRelevancyScoreReason = (
-                await self.model.a_generate(
-                    prompt, schema=ContextualRelevancyScoreReason
-                )
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_interaction_reason(
         self,
@@ -440,30 +434,21 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-
-
-
-
-
-
-
-        try:
-            res: ContextualRelevancyScoreReason = self.model.generate(
-                prompt, schema=ContextualRelevancyScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_verbose_steps(
-        self,
+        self, windows_scores: List[InteractionContextualRelevancyScore]
     ):
         steps = []
-        for index, interaction_score in enumerate(
+        for index, interaction_score in enumerate(windows_scores):
            interaction_steps = [
-                f"
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -474,6 +459,12 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -482,17 +473,23 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-
-
-
-
-
-
-
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -501,13 +498,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _calculate_score(
         self, scores: List[InteractionContextualRelevancyScore]
deepeval/metrics/turn_faithfulness/template.py

@@ -187,6 +187,13 @@ class TurnFaithfulnessTemplate:
         Context:
         This metric evaluates conversational faithfulness by extracting truths from retrieval context, extracting claims from the assistant's output, and generating verdicts that compare each claim against the truths. Each interaction yields a reason indicating why a verdict failed or succeeded. You are given all those reasons.

+        **
+        IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+        Example JSON:
+        {{
+            "reason": "The score is <turn_faithfulness_score> because <your_reason>."
+        }}
+
         Inputs:
         - final_score: the averaged score across all interactions.
         - success: whether the metric passed or failed
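The template now instructs the judge model to answer with a single-key JSON object, which makes the final reason trivially parseable. A small illustration of the expected shape (deepeval itself parses such replies with `trimAndLoadJson`, which additionally tolerates markdown code fences around the object):

```python
import json

# Shape requested by the updated TurnFaithfulnessTemplate prompt.
response = '{"reason": "The score is 0.75 because one claim contradicted the retrieval context."}'

# For a clean reply, plain json.loads is enough to pull out the reason.
reason = json.loads(response)["reason"]
print(reason)
```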
@@ -213,6 +220,6 @@ class TurnFaithfulnessTemplate:

         Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.

-
+        JSON:
         """
     )