deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py

@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,7 +12,10 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_recall.template import (
@@ -30,6 +33,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnContextualRecallMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
         TurnParams.EXPECTED_OUTCOME,
@@ -43,6 +47,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualRecallTemplate
         ] = TurnContextualRecallTemplate,
@@ -54,6 +59,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template
 
     def measure(
@@ -90,9 +96,19 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_recall_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -138,9 +154,25 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_recall_scores(
+                        window, test_case.multimodal, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -162,72 +194,67 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
 
     async def _a_get_contextual_recall_scores(
         self,
-
-
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-
-        retrieval_context = []
-        expected_outcome = (
-            f"Expected Assistant Message: \n{_expected_outcome}"
-        )
-        for turn in unit_interaction:
-            if turn.role == "assistant":
-                retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []
 
-
-
-
-
-
-
-
-
-                reason=reason,
-                verdicts=verdicts,
-            )
-            return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
 
-
-
-            get_interaction_score(unit_interaction)
-            for unit_interaction in unit_interactions
-        ]
+        verdicts = await self._a_generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
         )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     def _get_contextual_recall_scores(
         self,
-
-
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-
+        windows_scores = []
 
-
-
-
-
-
-
-            if turn.
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
 
-
-
-
-
-
-
-
-
-
-
-
-
+        verdicts = self._generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     async def _a_generate_verdicts(
         self,
@@ -246,25 +273,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-            res: Verdicts = await self.model.a_generate(
-                prompt, schema=Verdicts
-            )
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                ContextualRecallVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self,
@@ -283,23 +298,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
        )
 
-
-
-
-
-
-
-
-            res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                ContextualRecallVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     async def _a_get_interaction_score_and_reason(
         self,
@@ -308,7 +313,10 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -327,7 +335,10 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -381,22 +392,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualRecallScoreReason = await self.model.a_generate(
-                prompt, schema=ContextualRecallScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(
         self,
@@ -425,22 +427,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualRecallScoreReason = self.model.generate(
-                prompt, schema=ContextualRecallScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionContextualRecallScore]
@@ -448,7 +441,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -459,6 +452,12 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualRecallScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -467,17 +466,23 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionContextualRecallScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -486,13 +491,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionContextualRecallScore]
deepeval/metrics/turn_contextual_relevancy/schema.py

@@ -18,5 +18,5 @@ class ContextualRelevancyScoreReason(BaseModel):
 
 class InteractionContextualRelevancyScore(BaseModel):
     score: float
-    reason: str
-    verdicts: List[ContextualRelevancyVerdict]
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRelevancyVerdict]]
deepeval/metrics/turn_contextual_relevancy/template.py

@@ -130,6 +130,13 @@ class TurnContextualRelevancyTemplate:
 Context:
 This metric evaluates conversational contextual relevancy by determining whether statements in the retrieval context are relevant to the user message for each interaction. Each interaction yields a reason indicating which statements were relevant or irrelevant. You are given all those reasons.
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <contextual_relevancy_score> because <your_reason>."
+}}
+
 Inputs:
 - final_score: the averaged score across all interactions.
 - success: whether the metric passed or failed
@@ -156,6 +163,6 @@ class TurnContextualRelevancyTemplate:
 
 Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-
+JSON:
 """
 )