deepeval 3.7.5-py3-none-any.whl → 3.7.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py

```diff
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,7 +12,10 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_precision.template import (
@@ -30,6 +33,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnContextualPrecisionMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
         TurnParams.EXPECTED_OUTCOME,
@@ -43,6 +47,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualPrecisionTemplate
         ] = TurnContextualPrecisionTemplate,
@@ -54,6 +59,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template
 
     def measure(
@@ -90,9 +96,19 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_precision_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -138,9 +154,25 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_precision_scores(
+                        window, test_case.expected_outcome, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
```
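Both the sync `measure` and async `a_measure` paths now score overlapping windows of unit interactions rather than each interaction in isolation. The `get_turns_in_sliding_window` helper is imported from `deepeval.metrics.utils` and its body is not shown in this diff; a minimal sketch consistent with its call site (each yielded window is an iterable of unit interactions that the caller flattens with `itertools.chain(*window)`) might be:

```python
from typing import Iterator, List, TypeVar

T = TypeVar("T")  # here: one unit interaction, i.e. a List[Turn]


def get_turns_in_sliding_window(
    items: List[T], window_size: int
) -> Iterator[List[T]]:
    # For each position, yield the trailing window of up to
    # `window_size` items. This is a sketch of the assumed
    # semantics only; the real implementation lives in
    # deepeval/metrics/utils.py and may differ.
    for i in range(len(items)):
        yield items[max(0, i - window_size + 1) : i + 1]
```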
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py (continued)

```diff
@@ -162,78 +194,73 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
 
     async def _a_get_contextual_precision_scores(
         self,
-
-
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-
-        user_content = "User Message: "
-        retrieval_context = []
-        expected_outcome = (
-            f"Expected Assistant Message: \n{_expected_outcome}"
-        )
-        for turn in unit_interaction:
-            if turn.role == "user":
-                user_content += f"\n{turn.content} "
-            else:
-                retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []
 
-
-
-
-            user_content
-
-
-
-            reason=reason,
-            verdicts=verdicts,
-        )
-        return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
 
-
-
-
-
+        verdicts = await self._a_generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
         )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     def _get_contextual_precision_scores(
         self,
-
-
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-
+        windows_scores = []
 
-
-
-
-        f"
-
-
-            if turn.role == "user":
-                user_content += f"\n{turn.content} "
-            else:
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
 
-
-
-
-
-
-
-
-
-
-
-
-
+        verdicts = self._generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     async def _a_generate_verdicts(
         self,
@@ -254,26 +281,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-            res: Verdicts = await self.model.a_generate(
-                prompt, schema=Verdicts
-            )
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                ContextualPrecisionVerdict(**item)
-                for item in data["verdicts"]
-            ]
-            return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self,
@@ -294,24 +308,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-            res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                ContextualPrecisionVerdict(**item)
-                for item in data["verdicts"]
-            ]
-            return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     async def _a_get_interaction_score_and_reason(
         self,
```
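Every LLM call site in this file now routes through `generate_with_schema_and_extract` and its async twin, both imported from `deepeval.metrics.utils`. Their implementation is not part of this diff, but the try/except blocks they replace imply the contract: attempt schema-constrained generation and pull the requested field out of the parsed schema object; if the model raises `TypeError` because it does not accept a `schema` argument, fall back to free-form generation plus lenient JSON parsing. A hedged sketch of the sync variant (the bare `-` lines above likely held a native-model branch that the real helper also absorbs):

```python
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # same parser the old code used


def generate_with_schema_and_extract(
    metric,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    # Sketch of the assumed contract, not the actual deepeval helper.
    try:
        # Structured path: schema-aware models return a schema_cls instance.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback path: raw generation, then tolerant JSON extraction.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```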
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py (continued)

```diff
@@ -320,7 +323,10 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -339,7 +345,10 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -376,7 +385,6 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         if relevant_nodes_count == 0:
             return 0
 
-        # Calculate Average Precision
         score = sum_weighted_precision_at_k / relevant_nodes_count
         return 0 if self.strict_mode and score < self.threshold else score
 
@@ -408,24 +416,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualPrecisionScoreReason = (
-                await self.model.a_generate(
-                    prompt, schema=ContextualPrecisionScoreReason
-                )
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(
         self,
@@ -455,22 +452,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualPrecisionScoreReason = self.model.generate(
-                prompt, schema=ContextualPrecisionScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionContextualPrecisionScore]
@@ -478,7 +466,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -489,6 +477,12 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -497,17 +491,23 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -516,13 +516,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionContextualPrecisionScore]
```
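For callers, the visible API change in this file is the new `window_size` constructor argument (default 10), which caps how many unit interactions land in each sliding evaluation window. A usage sketch, assuming the metric is re-exported from `deepeval.metrics` like the package's other metrics:

```python
from deepeval.metrics import TurnContextualPrecisionMetric

# Smaller windows score shorter stretches of conversation per verdict set;
# `threshold` is assumed here to behave as on other deepeval metrics.
metric = TurnContextualPrecisionMetric(
    threshold=0.7,
    window_size=5,
)
```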
deepeval/metrics/turn_contextual_recall/schema.py

```diff
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 from pydantic import BaseModel
 
 
@@ -17,5 +17,5 @@ class ContextualRecallScoreReason(BaseModel):
 
 class InteractionContextualRecallScore(BaseModel):
     score: float
-    reason: str
-    verdicts: List[ContextualRecallVerdict]
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRecallVerdict]]
```
deepeval/metrics/turn_contextual_recall/template.py

```diff
@@ -125,6 +125,13 @@ class TurnContextualRecallTemplate:
     Context:
     This metric evaluates conversational contextual recall by determining whether sentences in the assistant output can be attributed to the retrieval context for each interaction. Each interaction yields a reason indicating which sentences were supported or unsupported. You are given all those reasons.
 
+    **
+    IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+    Example JSON:
+    {{
+        "reason": "The score is <contextual_recall_score> because <your_reason>."
+    }}
+
     Inputs:
     - final_score: the averaged score across all interactions.
     - success: whether the metric passed or failed
@@ -151,7 +158,7 @@ class TurnContextualRecallTemplate:
 
     Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-
+    JSON:
     """
         )
 
```