deepeval-3.7.5-py3-none-any.whl → deepeval-3.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,6 +12,7 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -30,6 +31,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnContextualPrecisionMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
         TurnParams.EXPECTED_OUTCOME,
@@ -43,6 +45,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualPrecisionTemplate
         ] = TurnContextualPrecisionTemplate,
@@ -54,6 +57,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template
 
     def measure(
@@ -90,9 +94,19 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-                scores = self._get_contextual_precision_scores(
-                    unit_interactions, test_case.expected_outcome, multimodal
-                )
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_precision_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -138,9 +152,25 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            scores = await self._a_get_contextual_precision_scores(
-                unit_interactions, test_case.expected_outcome, multimodal
-            )
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_precision_scores(
+                        window, test_case.expected_outcome, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -162,78 +192,73 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
 
     async def _a_get_contextual_precision_scores(
         self,
-        unit_interactions: List[List[Turn]],
-        _expected_outcome: str,
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-        async def get_interaction_score(unit_interaction: List[Turn]):
-            user_content = "User Message: "
-            retrieval_context = []
-            expected_outcome = (
-                f"Expected Assistant Message: \n{_expected_outcome}"
-            )
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
-                    retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []
 
-            verdicts = await self._a_generate_verdicts(
-                user_content, expected_outcome, retrieval_context, multimodal
-            )
-            score, reason = await self._a_get_interaction_score_and_reason(
-                user_content, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualPrecisionScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
 
-        final_scores = await asyncio.gather(
-            *[
-                get_interaction_score(unit_interaction)
-                for unit_interaction in unit_interactions
-            ]
+        verdicts = await self._a_generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
         )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return final_scores
+        return windows_scores
 
     def _get_contextual_precision_scores(
         self,
-        unit_interactions: List[List[Turn]],
-        _expected_outcome: str,
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-        interaction_scores = []
+        windows_scores = []
 
-        for unit_interaction in unit_interactions:
-            user_content = "User Message: "
-            retrieval_context = []
-            expected_outcome = (
-                f"Expected Assistant Message: \n{_expected_outcome}"
-            )
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
 
-            verdicts = self._generate_verdicts(
-                user_content, expected_outcome, retrieval_context, multimodal
-            )
-            score, reason = self._get_interaction_score_and_reason(
-                user_content, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualPrecisionScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            interaction_scores.append(interaction_score)
+        verdicts = self._generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return interaction_scores
+        return windows_scores
 
     async def _a_generate_verdicts(
         self,
@@ -320,7 +345,10 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -339,7 +367,10 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -376,7 +407,6 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         if relevant_nodes_count == 0:
             return 0
 
-        # Calculate Average Precision
         score = sum_weighted_precision_at_k / relevant_nodes_count
         return 0 if self.strict_mode and score < self.threshold else score
 
@@ -478,7 +508,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"Interaction {index + 1} \n",
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -489,6 +519,12 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -508,6 +544,12 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
     async def _a_generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
    ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
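In both `measure` and `a_measure` above, scoring moves from one pass per unit interaction to one pass per sliding window: `get_turns_in_sliding_window` groups consecutive unit interactions, and `itertools.chain` flattens each group into a single list of turns before the per-window scorer runs on it. A minimal sketch of that flattening step, using a hypothetical `sliding_window` stand-in since the helper's exact semantics are not shown in this diff:

import itertools
from typing import Iterator, List, TypeVar

T = TypeVar("T")


def sliding_window(items: List[List[T]], size: int) -> Iterator[List[List[T]]]:
    # Hypothetical stand-in for deepeval's get_turns_in_sliding_window:
    # assumed to yield consecutive windows of at most `size` unit interactions.
    for start in range(max(len(items) - size + 1, 1)):
        yield items[start : start + size]


unit_interactions = [["u1", "a1"], ["u2", "a2"], ["u3", "a3"]]
turns_windows = [
    list(itertools.chain(*window))  # flatten each window into one flat turn list
    for window in sliding_window(unit_interactions, size=2)
]
print(turns_windows)
# [['u1', 'a1', 'u2', 'a2'], ['u2', 'a2', 'u3', 'a3']]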
deepeval/metrics/turn_contextual_recall/schema.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 from pydantic import BaseModel
 
 
@@ -17,5 +17,5 @@ class ContextualRecallScoreReason(BaseModel):
 
 class InteractionContextualRecallScore(BaseModel):
     score: float
-    reason: str
-    verdicts: List[ContextualRecallVerdict]
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRecallVerdict]]
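The loosened annotations matter because the metric can now legitimately hand the schema a `None` reason or verdict list (for example when `include_reason` is disabled). A quick check of the new behaviour against a simplified, self-contained mirror of the schema (not imported from deepeval):

from typing import List, Optional

from pydantic import BaseModel


class ContextualRecallVerdict(BaseModel):  # simplified mirror for illustration
    verdict: str
    reason: str


class InteractionContextualRecallScore(BaseModel):
    score: float
    reason: Optional[str]
    verdicts: Optional[List[ContextualRecallVerdict]]


# With the old `reason: str` / `verdicts: List[...]` annotations this raised a
# ValidationError; with Optional fields, explicit None values now validate.
print(InteractionContextualRecallScore(score=1.0, reason=None, verdicts=None))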
deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,6 +12,7 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -30,6 +31,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnContextualRecallMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
         TurnParams.EXPECTED_OUTCOME,
@@ -43,6 +45,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualRecallTemplate
         ] = TurnContextualRecallTemplate,
@@ -54,6 +57,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template
 
     def measure(
@@ -90,9 +94,19 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-                scores = self._get_contextual_recall_scores(
-                    unit_interactions, test_case.expected_outcome, multimodal
-                )
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_recall_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -138,9 +152,25 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            scores = await self._a_get_contextual_recall_scores(
-                unit_interactions, test_case.expected_outcome, multimodal
-            )
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_recall_scores(
+                        window, test_case.multimodal, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -162,72 +192,67 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
 
     async def _a_get_contextual_recall_scores(
         self,
-        unit_interactions: List[List[Turn]],
-        _expected_outcome: str,
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-        async def get_interaction_score(unit_interaction: List[Turn]):
-            retrieval_context = []
-            expected_outcome = (
-                f"Expected Assistant Message: \n{_expected_outcome}"
-            )
-            for turn in unit_interaction:
-                if turn.role == "assistant":
-                    retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []
 
-            verdicts = await self._a_generate_verdicts(
-                expected_outcome, retrieval_context, multimodal
-            )
-            score, reason = await self._a_get_interaction_score_and_reason(
-                expected_outcome, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualRecallScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)
 
-        final_scores = await asyncio.gather(
-            *[
-                get_interaction_score(unit_interaction)
-                for unit_interaction in unit_interactions
-            ]
+        verdicts = await self._a_generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
        )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return final_scores
+        return windows_scores
 
     def _get_contextual_recall_scores(
         self,
-        unit_interactions: List[List[Turn]],
-        _expected_outcome: str,
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-        interaction_scores = []
+        windows_scores = []
 
-        for unit_interaction in unit_interactions:
-            retrieval_context = []
-            expected_outcome = (
-                f"Expected Assistant Message: \n{_expected_outcome}"
-            )
-            for turn in unit_interaction:
-                if turn.role == "assistant":
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
 
-            verdicts = self._generate_verdicts(
-                expected_outcome, retrieval_context, multimodal
-            )
-            score, reason = self._get_interaction_score_and_reason(
-                expected_outcome, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualRecallScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            interaction_scores.append(interaction_score)
+        verdicts = self._generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return interaction_scores
+        return windows_scores
 
     async def _a_generate_verdicts(
         self,
@@ -308,7 +333,10 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -327,7 +355,10 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )
 
         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -448,7 +479,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"Interaction {index + 1} \n",
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -459,6 +490,12 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualRecallScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -478,6 +515,12 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
     async def _a_generate_reason(
         self, scores: List[InteractionContextualRecallScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
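The async path above fans out one coroutine per window, each extending a shared `scores` list, before a single `asyncio.gather` awaits them all. A self-contained sketch of that accumulation pattern, with a placeholder scorer standing in for `_a_get_contextual_recall_scores`:

import asyncio
from typing import List


async def score_window(window: List[str]) -> List[float]:
    # Placeholder for an LLM-backed scorer such as _a_get_contextual_recall_scores.
    await asyncio.sleep(0)
    return [float(len(window))]


async def main() -> None:
    windows = [["u1", "a1"], ["u1", "a1", "u2", "a2"]]
    scores: List[float] = []

    async def collect(window: List[str]) -> None:
        # Each task extends the shared list; order follows task completion.
        scores.extend(await score_window(window))

    await asyncio.gather(*(collect(w) for w in windows))
    print(scores)  # e.g. [2.0, 4.0]


asyncio.run(main())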
deepeval/metrics/turn_contextual_relevancy/schema.py
@@ -18,5 +18,5 @@ class ContextualRelevancyScoreReason(BaseModel):
 
 class InteractionContextualRelevancyScore(BaseModel):
     score: float
-    reason: str
-    verdicts: List[ContextualRelevancyVerdict]
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRelevancyVerdict]]
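Taken together, the changes surface to callers mainly as the new `window_size` argument and the now-optional `reason`/`verdicts` fields. A hedged usage sketch, assuming the turn-level metrics are exported from `deepeval.metrics` as in earlier releases and that a judge model is configured (for example via `OPENAI_API_KEY`); the test-case fields shown are the ones this diff marks as required (`ROLE`, `CONTENT`, `RETRIEVAL_CONTEXT`, `EXPECTED_OUTCOME`):

from deepeval.metrics import TurnContextualPrecisionMetric
from deepeval.test_case import ConversationalTestCase, Turn

test_case = ConversationalTestCase(
    expected_outcome="The assistant confirms the refund was issued.",
    turns=[
        Turn(role="user", content="Where is my refund?"),
        Turn(
            role="assistant",
            content="Your refund was issued yesterday.",
            retrieval_context=["Refund #123 was issued on 2024-05-01."],
        ),
    ],
)

# window_size (new in 3.7.6) controls how many unit interactions are scored together.
metric = TurnContextualPrecisionMetric(threshold=0.7, window_size=10)
metric.measure(test_case)
print(metric.score, metric.reason)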