deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,6 +12,7 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -30,6 +31,7 @@ from deepeval.metrics.api import metric_data_manager

 class TurnContextualRelevancyMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
     ]
@@ -42,6 +44,7 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualRelevancyTemplate
         ] = TurnContextualRelevancyTemplate,
@@ -53,6 +56,7 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template

     def measure(
@@ -89,9 +93,19 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-                scores = self._get_contextual_relevancy_scores(
-                    unit_interactions, multimodal
-                )
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_relevancy_scores(
+                            window, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -137,9 +151,25 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            scores = await self._a_get_contextual_relevancy_scores(
-                unit_interactions, multimodal
-            )
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_relevancy_scores(
+                        window, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -160,69 +190,63 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         return self.score

     async def _a_get_contextual_relevancy_scores(
-        self, unit_interactions: List[List[Turn]], multimodal: bool
+        self, turns_window: List[Turn], multimodal: bool
     ):
-        async def get_interaction_score(unit_interaction: List[Turn]):
-            user_content = "User Message: "
-            retrieval_context = []
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
-                    retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []

-            # Generate verdicts for each retrieval context
-            verdicts = await self._a_generate_verdicts(
-                user_content, retrieval_context, multimodal
-            )
-            score, reason = await self._a_get_interaction_score_and_reason(
-                user_content, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualRelevancyScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)

-        final_scores = await asyncio.gather(
-            *[
-                get_interaction_score(unit_interaction)
-                for unit_interaction in unit_interactions
-            ]
+        verdicts = await self._a_generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
         )

-        return final_scores
+        windows_scores.append(interaction_score)
+
+        return windows_scores

     def _get_contextual_relevancy_scores(
-        self, unit_interactions: List[List[Turn]], multimodal: bool
+        self, turns_window: List[Turn], multimodal: bool
     ):
-        interaction_scores = []
-
-        for unit_interaction in unit_interactions:
-            user_content = "User Message: "
-            retrieval_context = []
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)

-            # Generate verdicts for each retrieval context
-            verdicts = self._generate_verdicts(
-                user_content, retrieval_context, multimodal
-            )
-            score, reason = self._get_interaction_score_and_reason(
-                user_content, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualRelevancyScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            interaction_scores.append(interaction_score)
+        verdicts = self._generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)

-        return interaction_scores
+        return windows_scores

     async def _a_generate_verdicts(
         self, input: str, retrieval_context: List[str], multimodal: bool
@@ -313,7 +337,10 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -332,7 +359,10 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -377,7 +407,6 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             if verdict.verdict.strip().lower() == "yes":
                 relevant_statements.append(verdict.statement)
             else:
-                # Include the reason for irrelevance
                 irrelevant_statements.append(
                     f"{verdict.statement}: {verdict.reason}"
                 )
@@ -458,12 +487,12 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         return data["reason"]

     def _get_verbose_steps(
-        self, interaction_scores: List[InteractionContextualRelevancyScore]
+        self, windows_scores: List[InteractionContextualRelevancyScore]
     ):
         steps = []
-        for index, interaction_score in enumerate(interaction_scores):
+        for index, interaction_score in enumerate(windows_scores):
             interaction_steps = [
-                f"Interaction {index + 1} \n",
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -474,6 +503,12 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -493,6 +528,12 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
     async def _a_generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
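The TurnContextualRelevancyMetric changes above move from scoring each unit interaction on its own to scoring sliding windows of unit interactions: windows come from get_turns_in_sliding_window (bounded by the new window_size argument) and each window is flattened into a single turn list with itertools.chain. Below is a minimal, self-contained sketch of that flattening pattern; sliding_window is a hypothetical stand-in for the deepeval-internal helper, and its stride is an assumption.

# Hypothetical sketch of the window-flattening pattern introduced above.
# `sliding_window` only mimics the presumed behaviour of the internal
# get_turns_in_sliding_window helper (consecutive windows of `size` items).
import itertools
from typing import Iterator, List


def sliding_window(
    unit_interactions: List[List[str]], size: int
) -> Iterator[List[List[str]]]:
    # Yield consecutive windows of at most `size` unit interactions.
    for start in range(max(len(unit_interactions) - size + 1, 1)):
        yield unit_interactions[start : start + size]


# Each inner list stands for one user/assistant unit interaction.
unit_interactions = [["u1", "a1"], ["u2", "a2"], ["u3", "a3"]]

# Same shape as the new code path: flatten every window into one turn list
# before handing it to the per-window scoring method.
turns_windows = [
    list(itertools.chain(*window))
    for window in sliding_window(unit_interactions, size=2)
]
print(turns_windows)
# [['u1', 'a1', 'u2', 'a2'], ['u2', 'a2', 'u3', 'a3']]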

deepeval/metrics/turn_faithfulness/schema.py
@@ -25,7 +25,7 @@ class FaithfulnessScoreReason(BaseModel):

 class InteractionFaithfulnessScore(BaseModel):
     score: float
-    reason: str
+    reason: Optional[str]
     claims: List[str]
     truths: List[str]
     verdicts: List[FaithfulnessVerdict]
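Relaxing reason to Optional[str] on InteractionFaithfulnessScore matches the new code paths that can hand back reason=None (for instance when include_reason is False). A small pydantic sketch of the effect, using a hypothetical stand-in model with only a subset of the real fields:

# Hypothetical stand-in mirroring part of InteractionFaithfulnessScore; the
# real model also carries verdicts typed with deepeval's FaithfulnessVerdict.
from typing import List, Optional
from pydantic import BaseModel


class InteractionFaithfulnessScoreSketch(BaseModel):
    score: float
    reason: Optional[str]  # was `str`; a None reason would previously fail validation
    claims: List[str]
    truths: List[str]


score = InteractionFaithfulnessScoreSketch(
    score=1.0, reason=None, claims=[], truths=[]
)
print(score.reason)  # None is now an accepted value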

deepeval/metrics/turn_faithfulness/turn_faithfulness.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,6 +12,7 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
 )
 from deepeval.models import DeepEvalBaseLLM
@@ -32,6 +33,7 @@ from deepeval.metrics.api import metric_data_manager

 class TurnFaithfulnessMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
     ]
@@ -46,6 +48,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
         penalize_ambiguous_claims: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnFaithfulnessTemplate
         ] = TurnFaithfulnessTemplate,
@@ -59,6 +62,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
         self.penalize_ambiguous_claims = penalize_ambiguous_claims
+        self.window_size = window_size

         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -98,9 +102,17 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-                scores = self._get_faithfulness_scores(
-                    unit_interactions, multimodal
-                )
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_faithfulness_scores(window, multimodal)
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -146,9 +158,23 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            scores = await self._a_get_faithfulness_scores(
-                unit_interactions, multimodal
-            )
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_faithfulness_scores(window, multimodal)
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -169,82 +195,75 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         return self.score

     async def _a_get_faithfulness_scores(
-        self, unit_interactions: List[List[Turn]], multimodal: bool
+        self, turns_window: List[Turn], multimodal: bool
     ):

-        async def get_interaction_score(unit_interaction: List[Turn]):
-            user_content = "User Message: "
-            retrieval_context = []
-            assistant_content = "Assistant Message: "
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
-                    assistant_content += f"\n{turn.content} "
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
-            truths = await self._a_generate_truths(
-                retrieval_context, multimodal
-            )
-            claims = await self._a_generate_claims(
-                user_content, assistant_content, multimodal
-            )
-            verdicts = await self._a_generate_verdicts(
-                claims, truths, multimodal
-            )
-            score, reason = self._get_interaction_score_and_reason(
-                verdicts, multimodal
-            )
-            interaction_score = InteractionFaithfulnessScore(
-                score=score,
-                reason=reason,
-                claims=claims,
-                truths=truths,
-                verdicts=verdicts,
-            )
-            return interaction_score

-        final_scores = await asyncio.gather(
-            *[
-                get_interaction_score(unit_interaction)
-                for unit_interaction in unit_interactions
-            ]
+        truths = await self._a_generate_truths(retrieval_context, multimodal)
+        claims = await self._a_generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = await self._a_generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
         )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)

-        return final_scores
+        return windows_scores

     def _get_faithfulness_scores(
-        self, unit_interactions: List[List[Turn]], multimodal: bool
+        self, turns_window: List[Turn], multimodal: bool
     ):
-        interaction_scores = []
-
-        for unit_interaction in unit_interactions:
-            user_content = "User Message: "
-            retrieval_context = []
-            assistant_content = "Assistant Message: "
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
-                    assistant_content += f"\n{turn.content} "
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
-            truths = self._generate_truths(retrieval_context, multimodal)
-            claims = self._generate_claims(
-                user_content, assistant_content, multimodal
-            )
-            verdicts = self._generate_verdicts(claims, truths, multimodal)
-            score, reason = self._get_interaction_score_and_reason(
-                verdicts, multimodal
-            )
-            interaction_score = InteractionFaithfulnessScore(
-                score=score,
-                reason=reason,
-                claims=claims,
-                truths=truths,
-                verdicts=verdicts,
-            )
-            interaction_scores.append(interaction_score)

-        return interaction_scores
+        truths = self._generate_truths(retrieval_context, multimodal)
+        claims = self._generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = self._generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
+        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores

     async def _a_generate_truths(
         self, retrieval_context: str, multimodal: bool
@@ -522,7 +541,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"Interaction {index + 1} \n",
+                f"Window {index + 1} \n",
                 f"Truths: {prettify_list(interaction_score.truths)} \n",
                 f"Claims: {prettify_list(interaction_score.claims)} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
@@ -535,6 +554,12 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -554,6 +579,12 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
     async def _a_generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
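For reference, a hedged usage sketch of the new window_size argument on the turn-level metrics, assuming TurnFaithfulnessMetric is importable from deepeval.metrics and that an evaluation model (for example an OpenAI key) is already configured; none of this is taken from the diff itself.

# Illustrative only: exercising the window_size parameter added in 3.7.6.
from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics import TurnFaithfulnessMetric  # assumed public export

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What is the refund window?"),
        Turn(
            role="assistant",
            content="Refunds are accepted within 30 days.",
            retrieval_context=[
                "Our policy allows refunds within 30 days of purchase."
            ],
        ),
    ]
)

# window_size (default 10) caps how many unit interactions are grouped into
# each sliding window; every window is flattened and scored independently.
metric = TurnFaithfulnessMetric(window_size=5)
metric.measure(test_case)
print(metric.score, metric.reason)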

deepeval/metrics/turn_relevancy/template.py
@@ -2,9 +2,20 @@ from typing import List, Dict


 class TurnRelevancyTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(sliding_window: List[Dict]):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether the LAST `assistant` message is relevant to context in messages. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the last `assistant` message is relevant according to the context in messages
 Provide a 'reason' ONLY if the answer is 'no'.
 You MUST USE the previous messages (if any) provided in the list of messages to make an informed judgement on relevancy.
@@ -52,6 +63,9 @@ JSON:
     @staticmethod
     def generate_reason(score, irrelevancies):
         return f"""Below is a list of irrelevancies drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why the 'assistant' messages are irrelevant to the 'user' messages.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 Given the relevancy score, which is a 0-1 score indicating how irrelevant the OVERALL AI messages are in a conversation (higher the better), CONCISELY summarize the irrelevancies to justify the score.

 **
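The template change injects the shared multimodal_rules block into both prompt builders. A quick hedged check that the block ends up in the rendered prompt (the module path comes from the file list above; the sliding_window payload shape is an assumption):

# Hypothetical check: the new multimodal rules should appear verbatim in the
# verdict prompt rendered by TurnRelevancyTemplate.generate_verdicts.
from deepeval.metrics.turn_relevancy.template import TurnRelevancyTemplate

prompt = TurnRelevancyTemplate.generate_verdicts(
    sliding_window=[
        {"role": "user", "content": "Describe the attached chart."},
        {"role": "assistant", "content": "The chart shows monthly revenue."},
    ]
)

assert "MULTIMODAL INPUT RULES" in prompt
print(prompt[:200])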