deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py (+110 -68):

```diff
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,6 +12,7 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -30,6 +31,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnContextualPrecisionMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
         TurnParams.EXPECTED_OUTCOME,
@@ -43,6 +45,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualPrecisionTemplate
         ] = TurnContextualPrecisionTemplate,
@@ -54,6 +57,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template
 
     def measure(
@@ -90,9 +94,19 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_precision_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -138,9 +152,25 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
            _in_component=_in_component,
        ):
            unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+           turns_windows: List[List[Turn]] = [
+               list(itertools.chain(*window))
+               for window in get_turns_in_sliding_window(
+                   unit_interactions, self.window_size
+               )
+           ]
+           scores = []
+           tasks = []
+
+           async def get_individual_scores(window):
+               scores.extend(
+                   await self._a_get_contextual_precision_scores(
+                       window, test_case.expected_outcome, multimodal
+                   )
+               )
+
+           for window in turns_windows:
+               tasks.append(get_individual_scores(window))
+           await asyncio.gather(*tasks)
            self.score = self._calculate_score(scores)
            self.success = self.score >= self.threshold
            self.reason = await self._a_generate_reason(scores)
@@ -162,78 +192,73 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
 
     async def _a_get_contextual_precision_scores(
         self,
-
-
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-
-        user_content = "User Message: "
-        retrieval_context = []
-        expected_outcome = (
-            f"Expected Assistant Message: \n{_expected_outcome}"
-        )
-        for turn in unit_interaction:
-            if turn.role == "user":
-                user_content += f"\n{turn.content} "
-            else:
-                retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []
 
-
-
-
-
-        user_content
-
-
-
-            reason=reason,
-            verdicts=verdicts,
-        )
-        return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
 
-
-
-
-
-
+        verdicts = await self._a_generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
         )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     def _get_contextual_precision_scores(
         self,
-
-
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-
+        windows_scores = []
 
-
-
-
-
-            f"
-
-
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
 
-
-
-
-
-
-
-
-
-
-
-
-
+        verdicts = self._generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     async def _a_generate_verdicts(
         self,
@@ -320,7 +345,10 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -339,7 +367,10 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -376,7 +407,6 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         if relevant_nodes_count == 0:
             return 0
 
-        # Calculate Average Precision
         score = sum_weighted_precision_at_k / relevant_nodes_count
         return 0 if self.strict_mode and score < self.threshold else score
 
@@ -478,7 +508,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -489,6 +519,12 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
        reasons = []
        for score in scores:
            reasons.append(score.reason)
@@ -508,6 +544,12 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
    async def _a_generate_reason(
        self, scores: List[InteractionContextualPrecisionScore]
    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
        reasons = []
        for score in scores:
            reasons.append(score.reason)
```
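The diff above adds a `window_size` parameter (default 10) and scores each sliding window of unit interactions instead of the whole conversation at once. A minimal usage sketch, assuming the constructor arguments and test-case fields referenced in the diff; the `deepeval.metrics` import path and the `threshold` argument are assumptions, not taken from this diff:

```python
# Sketch only: Turn/ConversationalTestCase fields are taken from attribute
# references in the diff above; the import path and `threshold` argument are
# assumptions.
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics import TurnContextualPrecisionMetric  # assumed export path

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What is the refund policy?"),
        Turn(
            role="assistant",
            content="Refunds are issued within 30 days.",
            retrieval_context=["Refunds are processed within 30 days of purchase."],
        ),
    ],
    expected_outcome="The assistant explains the 30-day refund window.",
)

# window_size is new in 3.7.6: it bounds how many unit interactions are
# flattened into each sliding window before verdicts are generated.
metric = TurnContextualPrecisionMetric(threshold=0.7, window_size=10)
metric.measure(test_case)
print(metric.score, metric.reason)
```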
deepeval/metrics/turn_contextual_recall/schema.py (+3 -3):

```diff
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 from pydantic import BaseModel
 
 
@@ -17,5 +17,5 @@ class ContextualRecallScoreReason(BaseModel):
 
 class InteractionContextualRecallScore(BaseModel):
     score: float
-    reason: str
-    verdicts: List[ContextualRecallVerdict]
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRecallVerdict]]
```
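Loosening `reason` and `verdicts` to `Optional` matches the new fallback paths in the metric code (for example, `_generate_reason` now returns `None` when `include_reason` is false). A small sketch of what the relaxed model accepts, using a simplified stand-in for the verdict model since the real `ContextualRecallVerdict` fields are not shown in this diff:

```python
from typing import List, Optional
from pydantic import BaseModel


class ContextualRecallVerdict(BaseModel):
    # Simplified stand-in; the real model's fields are not shown in this diff.
    verdict: str
    reason: Optional[str] = None


class InteractionContextualRecallScore(BaseModel):
    score: float
    reason: Optional[str]
    verdicts: Optional[List[ContextualRecallVerdict]]


# Both instances validate after the change; with the previous `reason: str`
# and `verdicts: List[...]` annotations the second would have raised.
full = InteractionContextualRecallScore(
    score=0.8,
    reason="Most of the expected outcome is attributable to the retrieval context.",
    verdicts=[ContextualRecallVerdict(verdict="yes")],
)
empty = InteractionContextualRecallScore(score=1.0, reason=None, verdicts=None)
```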
deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py (+104 -61):

```diff
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,6 +12,7 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -30,6 +31,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnContextualRecallMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
         TurnParams.EXPECTED_OUTCOME,
@@ -43,6 +45,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualRecallTemplate
         ] = TurnContextualRecallTemplate,
@@ -54,6 +57,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template
 
     def measure(
@@ -90,9 +94,19 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_recall_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -138,9 +152,25 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
            _in_component=_in_component,
        ):
            unit_interactions = get_unit_interactions(test_case.turns)
-
-
-
+           turns_windows: List[List[Turn]] = [
+               list(itertools.chain(*window))
+               for window in get_turns_in_sliding_window(
+                   unit_interactions, self.window_size
+               )
+           ]
+           scores = []
+           tasks = []
+
+           async def get_individual_scores(window):
+               scores.extend(
+                   await self._a_get_contextual_recall_scores(
+                       window, test_case.multimodal, multimodal
+                   )
+               )
+
+           for window in turns_windows:
+               tasks.append(get_individual_scores(window))
+           await asyncio.gather(*tasks)
            self.score = self._calculate_score(scores)
            self.success = self.score >= self.threshold
            self.reason = await self._a_generate_reason(scores)
@@ -162,72 +192,67 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
 
     async def _a_get_contextual_recall_scores(
         self,
-
-
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-
-        retrieval_context = []
-        expected_outcome = (
-            f"Expected Assistant Message: \n{_expected_outcome}"
-        )
-        for turn in unit_interaction:
-            if turn.role == "assistant":
-                retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []
 
-
-
-
-
-
-
-
-
-            reason=reason,
-            verdicts=verdicts,
-        )
-        return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
 
-
-
-            get_interaction_score(unit_interaction)
-            for unit_interaction in unit_interactions
-        ]
+        verdicts = await self._a_generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
         )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     def _get_contextual_recall_scores(
         self,
-
-
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-
+        windows_scores = []
 
-
-
-
-
-
-
-                if turn.
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
 
-
-
-
-
-
-
-
-
-
-
-
-
+        verdicts = self._generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return
+        return windows_scores
 
     async def _a_generate_verdicts(
         self,
@@ -308,7 +333,10 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -327,7 +355,10 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -448,7 +479,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -459,6 +490,12 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualRecallScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
        reasons = []
        for score in scores:
            reasons.append(score.reason)
@@ -478,6 +515,12 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
    async def _a_generate_reason(
        self, scores: List[InteractionContextualRecallScore]
    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
        reasons = []
        for score in scores:
            reasons.append(score.reason)
```
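Both turn-level metrics now iterate over sliding windows of unit interactions and flatten each window with `itertools.chain` before generating verdicts. `get_turns_in_sliding_window` itself is not included in this diff, so the helper below is a hypothetical stand-in used only to illustrate the flattening step that the new `measure()` bodies perform:

```python
# Illustration only: `sliding_window` is a hypothetical stand-in for
# deepeval's get_turns_in_sliding_window, whose implementation is not
# shown in this diff.
import itertools
from typing import Iterator, List, TypeVar

T = TypeVar("T")


def sliding_window(items: List[List[T]], size: int) -> Iterator[List[List[T]]]:
    """Yield consecutive windows of at most `size` unit interactions."""
    if len(items) <= size:
        yield items
        return
    for i in range(len(items) - size + 1):
        yield items[i : i + size]


# Unit interactions are lists of turns (plain strings stand in for Turn
# objects here); each window is flattened the same way as in the new code.
unit_interactions = [["u1", "a1"], ["u2", "a2"], ["u3", "a3"]]
turns_windows = [
    list(itertools.chain(*window))
    for window in sliding_window(unit_interactions, size=2)
]
print(turns_windows)
# [['u1', 'a1', 'u2', 'a2'], ['u2', 'a2', 'u3', 'a3']]
```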
deepeval/metrics/turn_contextual_relevancy/schema.py (+2 -2):

```diff
@@ -18,5 +18,5 @@ class ContextualRelevancyScoreReason(BaseModel):
 
 class InteractionContextualRelevancyScore(BaseModel):
     score: float
-    reason: str
-    verdicts: List[ContextualRelevancyVerdict]
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRelevancyVerdict]]
```