deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,7 +12,10 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_recall.template import (
@@ -30,6 +33,7 @@ from deepeval.metrics.api import metric_data_manager

 class TurnContextualRecallMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
         TurnParams.EXPECTED_OUTCOME,
@@ -43,6 +47,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualRecallTemplate
         ] = TurnContextualRecallTemplate,
@@ -54,6 +59,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template

     def measure(
@@ -90,9 +96,19 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-                scores = self._get_contextual_recall_scores(
-                    unit_interactions, test_case.expected_outcome, multimodal
-                )
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_recall_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
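The hunk above replaces per-interaction scoring with scoring over windows of unit interactions, sized by the new window_size argument. The sketch below is illustrative only: it uses a toy window generator, since get_turns_in_sliding_window is defined in deepeval/metrics/utils.py and its exact windowing behaviour (chunked vs. overlapping) is not shown in this diff. It is meant to show how itertools.chain(*window) flattens each window of unit interactions into one flat list of turns before scoring.

```python
import itertools
from typing import Iterator, List


def toy_sliding_window(
    unit_interactions: List[List[str]], window_size: int
) -> Iterator[List[List[str]]]:
    # Hypothetical stand-in: step through the unit interactions in chunks of
    # at most `window_size`; the real helper may slide with overlap instead.
    for start in range(0, len(unit_interactions), window_size):
        yield unit_interactions[start : start + window_size]


unit_interactions = [
    ["user: hi", "assistant: hello"],
    ["user: where are the docs?", "assistant: here is the link"],
]
turns_windows = [
    list(itertools.chain(*window))  # flatten each window into one flat turn list
    for window in toy_sliding_window(unit_interactions, window_size=10)
]
print(turns_windows)
# [['user: hi', 'assistant: hello', 'user: where are the docs?', 'assistant: here is the link']]
```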
@@ -138,9 +154,25 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            scores = await self._a_get_contextual_recall_scores(
-                unit_interactions, test_case.expected_outcome, multimodal
-            )
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_recall_scores(
+                        window, test_case.multimodal, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
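In the async path above, each window is scored by its own coroutine and all of them are awaited together. Below is a minimal sketch of that pattern with placeholder names, not deepeval APIs: each coroutine appends its results to a shared list, as a_measure() does.

```python
import asyncio
from typing import List


async def score_window(window: List[str], scores: List[float]) -> None:
    # Stand-in for the per-window LLM calls made by the metric.
    await asyncio.sleep(0)
    scores.extend([1.0] * len(window))


async def main() -> None:
    turns_windows = [["turn 1", "turn 2"], ["turn 3"]]
    scores: List[float] = []
    tasks = [score_window(window, scores) for window in turns_windows]
    await asyncio.gather(*tasks)  # run all window scorings concurrently
    print(scores)  # [1.0, 1.0, 1.0]


asyncio.run(main())
```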
@@ -162,72 +194,67 @@ class TurnContextualRecallMetric(BaseConversationalMetric):

     async def _a_get_contextual_recall_scores(
         self,
-        unit_interactions: List[List[Turn]],
-        _expected_outcome: str,
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-        async def get_interaction_score(unit_interaction: List[Turn]):
-            retrieval_context = []
-            expected_outcome = (
-                f"Expected Assistant Message: \n{_expected_outcome}"
-            )
-            for turn in unit_interaction:
-                if turn.role == "assistant":
-                    retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []

-            verdicts = await self._a_generate_verdicts(
-                expected_outcome, retrieval_context, multimodal
-            )
-            score, reason = await self._a_get_interaction_score_and_reason(
-                expected_outcome, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualRecallScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)

-        final_scores = await asyncio.gather(
-            *[
-                get_interaction_score(unit_interaction)
-                for unit_interaction in unit_interactions
-            ]
+        verdicts = await self._a_generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
         )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)

-        return final_scores
+        return windows_scores

     def _get_contextual_recall_scores(
         self,
-        unit_interactions: List[List[Turn]],
-        _expected_outcome: str,
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-        interaction_scores = []
+        windows_scores = []

-        for unit_interaction in unit_interactions:
-            retrieval_context = []
-            expected_outcome = (
-                f"Expected Assistant Message: \n{_expected_outcome}"
-            )
-            for turn in unit_interaction:
-                if turn.role == "assistant":
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)

-            verdicts = self._generate_verdicts(
-                expected_outcome, retrieval_context, multimodal
-            )
-            score, reason = self._get_interaction_score_and_reason(
-                expected_outcome, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualRecallScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            interaction_scores.append(interaction_score)
+        verdicts = self._generate_verdicts(
+            expected_outcome, retrieval_context, multimodal
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            expected_outcome, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualRecallScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)

-        return interaction_scores
+        return windows_scores

     async def _a_generate_verdicts(
         self,
@@ -246,25 +273,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualRecallVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )

     def _generate_verdicts(
         self,
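The repeated native-model / fallback branching is now funneled through a_generate_with_schema_and_extract and generate_with_schema_and_extract, imported from deepeval.metrics.utils. Their implementation is not part of this excerpt; the sketch below is a hypothetical reconstruction of the synchronous variant, inferred only from the inline logic it replaces here, and the real helper may validate the JSON fallback differently.

```python
from typing import Any, Callable, Type

from deepeval.metrics.utils import trimAndLoadJson  # deepeval's lenient JSON loader


def generate_with_schema_and_extract_sketch(
    metric: Any,
    prompt: str,
    schema_cls: Type,
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    # Hypothetical: signature mirrors the call sites above, body is inferred.
    if metric.using_native_model:
        # Native models return (parsed schema instance, cost).
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed instance directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without schema support: fall back to raw text + JSON parsing.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```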
@@ -283,23 +298,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualRecallVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )

     async def _a_get_interaction_score_and_reason(
         self,
@@ -308,7 +313,10 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -327,7 +335,10 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual recall.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -381,22 +392,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualRecallScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRecallScoreReason = await self.model.a_generate(
-                    prompt, schema=ContextualRecallScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_interaction_reason(
         self,
@@ -425,22 +427,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualRecallScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRecallScoreReason = self.model.generate(
-                    prompt, schema=ContextualRecallScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_verbose_steps(
         self, interaction_scores: List[InteractionContextualRecallScore]
@@ -448,7 +441,7 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"Interaction {index + 1} \n",
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -459,6 +452,12 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualRecallScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -467,17 +466,23 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(
         self, scores: List[InteractionContextualRecallScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -486,13 +491,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _calculate_score(
         self, scores: List[InteractionContextualRecallScore]
@@ -18,5 +18,5 @@ class ContextualRelevancyScoreReason(BaseModel):

 class InteractionContextualRelevancyScore(BaseModel):
     score: float
-    reason: str
-    verdicts: List[ContextualRelevancyVerdict]
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRelevancyVerdict]]
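With the schema hunk above, a window that has no retrieval context can carry a score without a reason or verdicts. A minimal, illustrative Pydantic sketch; the verdict model's field layout is assumed here for completeness, not taken from the diff:

```python
from typing import List, Optional

from pydantic import BaseModel


class ContextualRelevancyVerdict(BaseModel):
    # Field layout assumed for illustration only.
    verdict: str
    reason: Optional[str] = None


class InteractionContextualRelevancyScore(BaseModel):
    score: float
    reason: Optional[str]
    verdicts: Optional[List[ContextualRelevancyVerdict]]


# A perfect score with nothing to justify it no longer fails validation:
print(InteractionContextualRelevancyScore(score=1.0, reason=None, verdicts=None))
```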
@@ -130,6 +130,13 @@ class TurnContextualRelevancyTemplate:
 Context:
 This metric evaluates conversational contextual relevancy by determining whether statements in the retrieval context are relevant to the user message for each interaction. Each interaction yields a reason indicating which statements were relevant or irrelevant. You are given all those reasons.

+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <contextual_relevancy_score> because <your_reason>."
+}}
+
 Inputs:
 - final_score: the averaged score across all interactions.
 - success: whether the metric passed or failed
@@ -156,6 +163,6 @@ class TurnContextualRelevancyTemplate:

 Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.

-The final reason:
+JSON:
 """
         )
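The template change above makes the final-reason prompt ask for a JSON object with a 'reason' key instead of free text. The snippet below is illustrative only: it shows the shape of the reply the prompt requests and a bare-bones extraction; deepeval's actual parsing goes through its score-reason schema classes and trimAndLoadJson.

```python
import json

# The reply format the updated prompt asks the judge model to produce:
raw_reply = '{"reason": "The score is 0.85 because most retrieval statements were relevant to the user message."}'
print(json.loads(raw_reply)["reason"])
```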