deepeval-3.7.5-py3-none-any.whl → deepeval-3.7.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0

deepeval/metrics/turn_faithfulness/turn_faithfulness.py

@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,7 +12,10 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_faithfulness.template import (
@@ -32,6 +35,7 @@ from deepeval.metrics.api import metric_data_manager
 
 class TurnFaithfulnessMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
     ]
@@ -46,6 +50,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
         penalize_ambiguous_claims: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnFaithfulnessTemplate
         ] = TurnFaithfulnessTemplate,
@@ -59,6 +64,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
         self.penalize_ambiguous_claims = penalize_ambiguous_claims
+        self.window_size = window_size
 
         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -98,9 +104,17 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-                scores = self._get_faithfulness_scores(
-                    unit_interactions, multimodal
-                )
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_faithfulness_scores(window, multimodal)
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -146,9 +160,23 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            scores = await self._a_get_faithfulness_scores(
-                unit_interactions, multimodal
-            )
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_faithfulness_scores(window, multimodal)
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -169,82 +197,75 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         return self.score
 
     async def _a_get_faithfulness_scores(
-        self, unit_interactions: List[List[Turn]], multimodal: bool
+        self, turns_window: List[Turn], multimodal: bool
     ):
 
-        async def get_interaction_score(unit_interaction: List[Turn]):
-            user_content = "User Message: "
-            retrieval_context = []
-            assistant_content = "Assistant Message: "
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
-                    assistant_content += f"\n{turn.content} "
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
-            truths = await self._a_generate_truths(
-                retrieval_context, multimodal
-            )
-            claims = await self._a_generate_claims(
-                user_content, assistant_content, multimodal
-            )
-            verdicts = await self._a_generate_verdicts(
-                claims, truths, multimodal
-            )
-            score, reason = self._get_interaction_score_and_reason(
-                verdicts, multimodal
-            )
-            interaction_score = InteractionFaithfulnessScore(
-                score=score,
-                reason=reason,
-                claims=claims,
-                truths=truths,
-                verdicts=verdicts,
-            )
-            return interaction_score
 
-        final_scores = await asyncio.gather(
-            *[
-                get_interaction_score(unit_interaction)
-                for unit_interaction in unit_interactions
-            ]
+        truths = await self._a_generate_truths(retrieval_context, multimodal)
+        claims = await self._a_generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = await self._a_generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
         )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
 
-        return final_scores
+        return windows_scores
 
     def _get_faithfulness_scores(
-        self, unit_interactions: List[List[Turn]], multimodal: bool
+        self, turns_window: List[Turn], multimodal: bool
     ):
-        interaction_scores = []
-
-        for unit_interaction in unit_interactions:
-            user_content = "User Message: "
-            retrieval_context = []
-            assistant_content = "Assistant Message: "
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
-                    assistant_content += f"\n{turn.content} "
+        windows_scores = []
+
+        user_content = ""
+        assistant_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                assistant_content += f"\n{turn.content}"
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)
-            truths = self._generate_truths(retrieval_context, multimodal)
-            claims = self._generate_claims(
-                user_content, assistant_content, multimodal
-            )
-            verdicts = self._generate_verdicts(claims, truths, multimodal)
-            score, reason = self._get_interaction_score_and_reason(
-                verdicts, multimodal
-            )
-            interaction_score = InteractionFaithfulnessScore(
-                score=score,
-                reason=reason,
-                claims=claims,
-                truths=truths,
-                verdicts=verdicts,
-            )
-            interaction_scores.append(interaction_score)
 
-        return interaction_scores
+        truths = self._generate_truths(retrieval_context, multimodal)
+        claims = self._generate_claims(
+            user_content, assistant_content, multimodal
+        )
+        verdicts = self._generate_verdicts(claims, truths, multimodal)
+        score, reason = self._get_interaction_score_and_reason(
+            verdicts, multimodal
+        )
+        interaction_score = InteractionFaithfulnessScore(
+            score=score,
+            reason=reason,
+            claims=claims,
+            truths=truths,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)
+
+        return windows_scores
 
     async def _a_generate_truths(
         self, retrieval_context: str, multimodal: bool
@@ -254,18 +275,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = await self.model.a_generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     def _generate_truths(
         self, retrieval_context: str, multimodal: bool
@@ -275,18 +292,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = self.model.generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     async def _a_generate_claims(
         self, user_content: str, assistant_content: str, multimodal: bool
@@ -296,18 +309,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             assistant_output=assistant_content,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def _generate_claims(
         self, user_content: str, assistant_content: str, multimodal: bool
@@ -317,18 +326,14 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             assistant_output=assistant_content,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     async def _a_generate_verdicts(
         self, claims: Claims, truths: Truths, multimodal: bool
@@ -344,25 +349,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self, claims: Claims, truths: Truths, multimodal: bool
@@ -378,23 +371,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _get_interaction_score_and_reason(
         self, verdicts, multimodal: bool
@@ -467,22 +450,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = await self.model.a_generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(self, score, verdicts, multimodal: bool) -> str:
         if self.include_reason is False:
@@ -499,22 +473,13 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = self.model.generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionFaithfulnessScore]
@@ -522,7 +487,7 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
            interaction_steps = [
-                f"Interaction {index + 1} \n",
+                f"Window {index + 1} \n",
                 f"Truths: {prettify_list(interaction_score.truths)} \n",
                 f"Claims: {prettify_list(interaction_score.claims)} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
@@ -535,6 +500,12 @@ class TurnFaithfulnessMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -543,17 +514,23 @@ 
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionFaithfulnessScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -562,13 +539,13 @@ 
             self.score, self.success, reasons
        )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionFaithfulnessScore]
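
Note on the refactor above: each inline `if self.using_native_model: ... else: try/except TypeError ...` block is replaced by a call to the new `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` helpers imported from deepeval.metrics.utils. The sketch below is a reconstruction of what the synchronous helper plausibly does, inferred only from the inline branches it replaces in this diff; the actual implementation in deepeval/metrics/utils.py may differ in details.

def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    # Native models return (parsed schema, cost); the cost is accumulated on the metric.
    if metric.using_native_model:
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback for models without structured output: parse the raw JSON text.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)  # helper assumed to live in the same module, as before
        return extract_json(data)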

deepeval/metrics/turn_relevancy/template.py

@@ -2,9 +2,20 @@ from typing import List, Dict
 
 
 class TurnRelevancyTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(sliding_window: List[Dict]):
         return f"""Based on the given list of message exchanges between a user and an LLM, generate a JSON object to indicate whether the LAST `assistant` message is relevant to context in messages. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', which states whether the last `assistant` message is relevant according to the context in messages
 Provide a 'reason' ONLY if the answer is 'no'.
 You MUST USE the previous messages (if any) provided in the list of messages to make an informed judgement on relevancy.
@@ -52,6 +63,9 @@ JSON:
     @staticmethod
     def generate_reason(score, irrelevancies):
         return f"""Below is a list of irrelevancies drawn from some messages in a conversation, which you have minimal knowledge of. It is a list of strings explaining why the 'assistant' messages are irrelevant to the 'user' messages.
+
+{TurnRelevancyTemplate.multimodal_rules}
+
 Given the relevancy score, which is a 0-1 score indicating how irrelevant the OVERALL AI messages are in a conversation (higher the better), CONCISELY summarize the irrelevancies to justify the score.
 
 **
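
For illustration, a minimal usage sketch of the new window_size parameter added to TurnFaithfulnessMetric earlier in this diff. The example conversation, the include_reason keyword, and the measure() call are assumed from deepeval's existing metric interface rather than taken from this diff.

from deepeval.test_case import ConversationalTestCase, Turn
from deepeval.metrics.turn_faithfulness.turn_faithfulness import (
    TurnFaithfulnessMetric,
)

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What is the refund window?"),
        Turn(
            role="assistant",
            content="You can request a refund within 30 days of purchase.",
            retrieval_context=["Refunds are accepted within 30 days of purchase."],
        ),
    ]
)

# Unit interactions are grouped into sliding windows of up to `window_size`
# interactions, and each window is flattened into turns before truths and
# claims are extracted (the default is 10 in 3.7.7).
metric = TurnFaithfulnessMetric(window_size=4, include_reason=True)
metric.measure(test_case)
print(metric.score, metric.reason)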