deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,7 +12,10 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_relevancy.template import (
@@ -30,6 +33,7 @@ from deepeval.metrics.api import metric_data_manager

 class TurnContextualRelevancyMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
     ]
@@ -42,6 +46,7 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualRelevancyTemplate
         ] = TurnContextualRelevancyTemplate,
@@ -53,6 +58,7 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template

     def measure(
@@ -89,9 +95,19 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-                scores = self._get_contextual_relevancy_scores(
-                    unit_interactions, multimodal
-                )
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_relevancy_scores(
+                            window, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -137,9 +153,25 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            scores = await self._a_get_contextual_relevancy_scores(
-                unit_interactions, multimodal
-            )
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_relevancy_scores(
+                        window, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
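Both `measure()` and `a_measure()` now batch unit interactions into sliding windows of at most `window_size` (default 10) and flatten each window into a single list of turns with `itertools.chain` before scoring. `get_turns_in_sliding_window` is imported from `deepeval.metrics.utils` and its implementation is not part of this diff; the sketch below uses a hypothetical `sliding_window` stand-in, assuming the helper yields consecutive, overlapping windows of unit interactions:

```python
import itertools
from typing import Iterator, List

# Hypothetical stand-in for deepeval.metrics.utils.get_turns_in_sliding_window;
# here a "unit interaction" is simply a list of turn-like dicts.
def sliding_window(items: List[List[dict]], size: int) -> Iterator[List[List[dict]]]:
    if len(items) <= size:
        yield items
        return
    for i in range(len(items) - size + 1):
        yield items[i : i + size]

unit_interactions = [
    [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello"}],
    [{"role": "user", "content": "Weather?"}, {"role": "assistant", "content": "Sunny"}],
    [{"role": "user", "content": "Thanks"}, {"role": "assistant", "content": "Anytime"}],
]

# Mirrors the new measure() logic: flatten each window into a flat list of turns.
turns_windows = [
    list(itertools.chain(*window))
    for window in sliding_window(unit_interactions, size=2)
]
print(len(turns_windows))     # 2 windows for 3 unit interactions and size=2
print(len(turns_windows[0]))  # 4 turns in the first flattened window
```

Each flattened window is then scored as one unit, so long conversations produce several window-level scores that are averaged into the final metric score.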
@@ -160,69 +192,63 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         return self.score

     async def _a_get_contextual_relevancy_scores(
-        self, unit_interactions: List[List[Turn]], multimodal: bool
+        self, turns_window: List[Turn], multimodal: bool
     ):
-        async def get_interaction_score(unit_interaction: List[Turn]):
-            user_content = "User Message: "
-            retrieval_context = []
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
-                    retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []

-            # Generate verdicts for each retrieval context
-            verdicts = await self._a_generate_verdicts(
-                user_content, retrieval_context, multimodal
-            )
-            score, reason = await self._a_get_interaction_score_and_reason(
-                user_content, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualRelevancyScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)

-        final_scores = await asyncio.gather(
-            *[
-                get_interaction_score(unit_interaction)
-                for unit_interaction in unit_interactions
-            ]
+        verdicts = await self._a_generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
         )

-        return final_scores
+        windows_scores.append(interaction_score)
+
+        return windows_scores

     def _get_contextual_relevancy_scores(
-        self, unit_interactions: List[List[Turn]], multimodal: bool
+        self, turns_window: List[Turn], multimodal: bool
     ):
-        interaction_scores = []
-
-        for unit_interaction in unit_interactions:
-            user_content = "User Message: "
-            retrieval_context = []
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
+        windows_scores = []
+
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)

-            # Generate verdicts for each retrieval context
-            verdicts = self._generate_verdicts(
-                user_content, retrieval_context, multimodal
-            )
-            score, reason = self._get_interaction_score_and_reason(
-                user_content, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualRelevancyScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            interaction_scores.append(interaction_score)
+        verdicts = self._generate_verdicts(
+            user_content, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRelevancyScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)

-        return interaction_scores
+        return windows_scores

     async def _a_generate_verdicts(
         self, input: str, retrieval_context: List[str], multimodal: bool
@@ -240,29 +266,15 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualRelevancyVerdicts
-            )
-            self.evaluation_cost += cost
-            verdicts.extend([item for item in res.verdicts])
-        else:
-            try:
-                res: ContextualRelevancyVerdicts = (
-                    await self.model.a_generate(
-                        prompt, schema=ContextualRelevancyVerdicts
-                    )
-                )
-                verdicts.extend([item for item in res.verdicts])
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts.extend(
-                    [
-                        ContextualRelevancyVerdict(**item)
-                        for item in data["verdicts"]
-                    ]
-                )
+        result = await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyVerdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
+
+        verdicts.extend(result)

         return verdicts

@@ -282,27 +294,15 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualRelevancyVerdicts
-            )
-            self.evaluation_cost += cost
-            verdicts.extend([item for item in res.verdicts])
-        else:
-            try:
-                res: ContextualRelevancyVerdicts = self.model.generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-                verdicts.extend([item for item in res.verdicts])
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts.extend(
-                    [
-                        ContextualRelevancyVerdict(**item)
-                        for item in data["verdicts"]
-                    ]
-                )
+        result = generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyVerdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
+
+        verdicts.extend(result)

         return verdicts

@@ -313,7 +313,10 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -332,7 +335,10 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual relevancy.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -377,7 +383,6 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
            if verdict.verdict.strip().lower() == "yes":
                relevant_statements.append(verdict.statement)
            else:
-                # Include the reason for irrelevance
                irrelevant_statements.append(
                    f"{verdict.statement}: {verdict.reason}"
                )
@@ -390,24 +395,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRelevancyScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=ContextualRelevancyScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_interaction_reason(
         self,
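The same consolidation appears in every verdict- and reason-generation method: the per-call `using_native_model` branching (native models return a `(result, cost)` tuple; custom models may raise `TypeError` on the `schema` kwarg and fall back to `trimAndLoadJson`) moves into the new `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` helpers imported from `deepeval.metrics.utils`. Their implementation is not part of this diff; below is a minimal sketch inferred from the call sites and the removed branches, so the real helpers may differ:

```python
# Inferred sketch only, based on the call sites above and the branching they
# replace; the actual helper lives in deepeval.metrics.utils. trimAndLoadJson
# is deepeval's existing JSON-repair loader, already imported in this module.
from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    if metric.using_native_model:
        # Native models return (parsed schema instance, cost).
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that support structured output.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without a `schema` kwarg fall back to raw text + JSON repair.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```

The `extract_schema` / `extract_json` callables let each call site pull the field it needs (`verdicts` or `reason`) from either the parsed schema object or the repaired JSON dict.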
@@ -440,30 +434,21 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRelevancyScoreReason = self.model.generate(
-                    prompt, schema=ContextualRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_verbose_steps(
-        self, interaction_scores: List[InteractionContextualRelevancyScore]
+        self, windows_scores: List[InteractionContextualRelevancyScore]
     ):
         steps = []
-        for index, interaction_score in enumerate(interaction_scores):
+        for index, interaction_score in enumerate(windows_scores):
             interaction_steps = [
-                f"Interaction {index + 1} \n",
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -474,6 +459,12 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -482,17 +473,23 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -501,13 +498,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _calculate_score(
         self, scores: List[InteractionContextualRelevancyScore]
@@ -25,7 +25,7 @@ class FaithfulnessScoreReason(BaseModel):

 class InteractionFaithfulnessScore(BaseModel):
     score: float
-    reason: str
+    reason: Optional[str]
     claims: List[str]
     truths: List[str]
     verdicts: List[FaithfulnessVerdict]
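In the `turn_faithfulness` schema, `InteractionFaithfulnessScore.reason` is relaxed from `str` to `Optional[str]`, presumably because an interaction's reason can now legitimately be `None` (for example, when reason generation is skipped). A minimal, self-contained illustration with simplified stand-in models (not the real `FaithfulnessVerdict`):

```python
from typing import List, Optional
from pydantic import BaseModel

# Reduced stand-ins for illustration; field names mirror the diff.
class FaithfulnessVerdict(BaseModel):
    verdict: str
    reason: Optional[str] = None

class InteractionFaithfulnessScore(BaseModel):
    score: float
    reason: Optional[str]  # was `str`; now accepts None
    claims: List[str]
    truths: List[str]
    verdicts: List[FaithfulnessVerdict]

# A score with no reason now validates; under the old `reason: str`
# annotation, passing None here would raise a ValidationError.
ok = InteractionFaithfulnessScore(
    score=1.0, reason=None, claims=[], truths=[], verdicts=[]
)
print(ok.reason)  # None
```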
@@ -187,6 +187,13 @@ class TurnFaithfulnessTemplate:
            Context:
            This metric evaluates conversational faithfulness by extracting truths from retrieval context, extracting claims from the assistant's output, and generating verdicts that compare each claim against the truths. Each interaction yields a reason indicating why a verdict failed or succeeded. You are given all those reasons.

+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            Example JSON:
+            {{
+                "reason": "The score is <turn_faithfulness_score> because <your_reason>."
+            }}
+
            Inputs:
            - final_score: the averaged score across all interactions.
            - success: whether the metric passed or failed
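The final-reason prompt in `TurnFaithfulnessTemplate` now asks for a JSON object with a `reason` key instead of free text, which gives the non-schema fallback path something machine-parseable. A rough illustration of why that helps (a hypothetical parser, not deepeval's `trimAndLoadJson`):

```python
import json
import re

def extract_reason(raw: str) -> str:
    # Strip optional markdown code fences, then parse the JSON object
    # the template now requests: {"reason": "..."}.
    cleaned = re.sub(r"^```(?:json)?\s*|```\s*$", "", raw.strip(), flags=re.MULTILINE)
    return json.loads(cleaned.strip())["reason"]

print(extract_reason('```json\n{"reason": "The score is 0.8 because most claims are grounded."}\n```'))
```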
@@ -213,6 +220,6 @@

            Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.

-            The final reason:
+            JSON:
            """
        )