deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py

@@ -1,6 +1,6 @@
 from typing import List, Optional, Union, Type, Tuple
 import asyncio
-
+import itertools
 from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.utils import (
@@ -12,7 +12,10 @@ from deepeval.metrics.utils import (
     trimAndLoadJson,
     check_conversational_test_case_params,
     get_unit_interactions,
+    get_turns_in_sliding_window,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_precision.template import (
@@ -30,6 +33,7 @@ from deepeval.metrics.api import metric_data_manager

 class TurnContextualPrecisionMetric(BaseConversationalMetric):
     _required_test_case_params: List[TurnParams] = [
+        TurnParams.ROLE,
         TurnParams.CONTENT,
         TurnParams.RETRIEVAL_CONTEXT,
         TurnParams.EXPECTED_OUTCOME,
@@ -43,6 +47,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        window_size: int = 10,
         evaluation_template: Type[
             TurnContextualPrecisionTemplate
         ] = TurnContextualPrecisionTemplate,
@@ -54,6 +59,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         self.async_mode = async_mode
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
+        self.window_size = window_size
         self.evaluation_template = evaluation_template

     def measure(
@@ -90,9 +96,19 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
                 )
             else:
                 unit_interactions = get_unit_interactions(test_case.turns)
-                scores = self._get_contextual_precision_scores(
-                    unit_interactions, test_case.expected_outcome, multimodal
-                )
+                turns_windows: List[List[Turn]] = [
+                    list(itertools.chain(*window))
+                    for window in get_turns_in_sliding_window(
+                        unit_interactions, self.window_size
+                    )
+                ]
+                scores = []
+                for window in turns_windows:
+                    scores.extend(
+                        self._get_contextual_precision_scores(
+                            window, test_case.expected_outcome, multimodal
+                        )
+                    )
                 self.score = self._calculate_score(scores)
                 self.success = self.score >= self.threshold
                 self.reason = self._generate_reason(scores)
@@ -138,9 +154,25 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             _in_component=_in_component,
         ):
             unit_interactions = get_unit_interactions(test_case.turns)
-            scores = await self._a_get_contextual_precision_scores(
-                unit_interactions, test_case.expected_outcome, multimodal
-            )
+            turns_windows: List[List[Turn]] = [
+                list(itertools.chain(*window))
+                for window in get_turns_in_sliding_window(
+                    unit_interactions, self.window_size
+                )
+            ]
+            scores = []
+            tasks = []
+
+            async def get_individual_scores(window):
+                scores.extend(
+                    await self._a_get_contextual_precision_scores(
+                        window, test_case.expected_outcome, multimodal
+                    )
+                )
+
+            for window in turns_windows:
+                tasks.append(get_individual_scores(window))
+            await asyncio.gather(*tasks)
             self.score = self._calculate_score(scores)
             self.success = self.score >= self.threshold
             self.reason = await self._a_generate_reason(scores)
@@ -162,78 +194,73 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):

     async def _a_get_contextual_precision_scores(
         self,
-        unit_interactions: List[List[Turn]],
-        _expected_outcome: str,
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-        async def get_interaction_score(unit_interaction: List[Turn]):
-            user_content = "User Message: "
-            retrieval_context = []
-            expected_outcome = (
-                f"Expected Assistant Message: \n{_expected_outcome}"
-            )
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
-                    retrieval_context.extend(turn.retrieval_context)
+        windows_scores = []

-            verdicts = await self._a_generate_verdicts(
-                user_content, expected_outcome, retrieval_context, multimodal
-            )
-            score, reason = await self._a_get_interaction_score_and_reason(
-                user_content, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualPrecisionScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            return interaction_score
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
+                    retrieval_context.extend(turn.retrieval_context)

-        final_scores = await asyncio.gather(
-            *[
-                get_interaction_score(unit_interaction)
-                for unit_interaction in unit_interactions
-            ]
+        verdicts = await self._a_generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
         )
+        score, reason = await self._a_get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)

-        return final_scores
+        return windows_scores

     def _get_contextual_precision_scores(
         self,
-        unit_interactions: List[List[Turn]],
-        _expected_outcome: str,
+        turns_window: List[Turn],
+        expected_outcome: str,
         multimodal: bool,
     ):
-        interaction_scores = []
+        windows_scores = []

-        for unit_interaction in unit_interactions:
-            user_content = "User Message: "
-            retrieval_context = []
-            expected_outcome = (
-                f"Expected Assistant Message: \n{_expected_outcome}"
-            )
-            for turn in unit_interaction:
-                if turn.role == "user":
-                    user_content += f"\n{turn.content} "
-                else:
+        user_content = ""
+        retrieval_context = []
+        for turn in turns_window:
+            if turn.role == "user":
+                user_content += f"\n{turn.content} "
+            else:
+                if turn.retrieval_context is not None:
                     retrieval_context.extend(turn.retrieval_context)

-            verdicts = self._generate_verdicts(
-                user_content, expected_outcome, retrieval_context, multimodal
-            )
-            score, reason = self._get_interaction_score_and_reason(
-                user_content, verdicts, multimodal
-            )
-            interaction_score = InteractionContextualPrecisionScore(
-                score=score,
-                reason=reason,
-                verdicts=verdicts,
-            )
-            interaction_scores.append(interaction_score)
+        verdicts = self._generate_verdicts(
+            user_content,
+            expected_outcome,
+            retrieval_context,
+            multimodal,
+        )
+        score, reason = self._get_interaction_score_and_reason(
+            user_content, verdicts, multimodal
+        )
+        interaction_score = InteractionContextualPrecisionScore(
+            score=score,
+            reason=reason,
+            verdicts=verdicts,
+        )
+        windows_scores.append(interaction_score)

-        return interaction_scores
+        return windows_scores

     async def _a_generate_verdicts(
         self,
@@ -254,26 +281,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualPrecisionVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )

     def _generate_verdicts(
         self,
@@ -294,24 +308,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualPrecisionVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )

     async def _a_get_interaction_score_and_reason(
         self,
@@ -320,7 +323,10 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = await self._a_get_interaction_reason(
@@ -339,7 +345,10 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         multimodal: bool,
     ) -> Tuple[float, str]:
         if len(verdicts) == 0:
-            return 1, None
+            return (
+                1,
+                "There were no retrieval contexts in the given turns to evaluate the contextual precision.",
+            )

         score = self._calculate_interaction_score(verdicts)
         reason = self._get_interaction_reason(
@@ -376,7 +385,6 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         if relevant_nodes_count == 0:
             return 0

-        # Calculate Average Precision
         score = sum_weighted_precision_at_k / relevant_nodes_count
         return 0 if self.strict_mode and score < self.threshold else score

@@ -408,24 +416,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualPrecisionScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualPrecisionScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=ContextualPrecisionScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_interaction_reason(
         self,
@@ -455,22 +452,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualPrecisionScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualPrecisionScoreReason = self.model.generate(
-                    prompt, schema=ContextualPrecisionScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _get_verbose_steps(
         self, interaction_scores: List[InteractionContextualPrecisionScore]
@@ -478,7 +466,7 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
         steps = []
         for index, interaction_score in enumerate(interaction_scores):
             interaction_steps = [
-                f"Interaction {index + 1} \n",
+                f"Window {index + 1} \n",
                 f"Verdicts: {prettify_list(interaction_score.verdicts)} \n",
                 f"Score: {interaction_score.score} \n",
                 f"Reason: {interaction_score.reason} \n",
@@ -489,6 +477,12 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
     def _generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -497,17 +491,23 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
     ) -> str:
+        if self.include_reason is False:
+            return None
+
+        if len(scores) == 0:
+            return "There were no retrieval contexts in your turns to evaluate, hence the score is 1"
+
         reasons = []
         for score in scores:
             reasons.append(score.reason)
@@ -516,13 +516,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )

-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _calculate_score(
         self, scores: List[InteractionContextualPrecisionScore]
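
Note: the largest change above replaces per-interaction scoring with scoring over sliding windows of unit interactions, each window flattened into a single list of turns via itertools.chain before verdicts are generated. A minimal sketch of that grouping pattern follows; it uses plain strings in place of Turn objects and a hypothetical sliding_windows helper, since the real get_turns_in_sliding_window lives in deepeval.metrics.utils and its exact window semantics are not visible in this diff.

import itertools
from typing import Iterator, List

def sliding_windows(unit_interactions: List[List[str]], size: int) -> Iterator[List[List[str]]]:
    # Hypothetical stand-in for get_turns_in_sliding_window: each window holds
    # the current unit interaction plus up to `size - 1` preceding ones.
    for i in range(len(unit_interactions)):
        yield unit_interactions[max(0, i - size + 1) : i + 1]

# Mirrors the new measure() logic: flatten each window of unit interactions
# (lists of turns) into one flat list of turns before scoring it.
unit_interactions = [["u1", "a1"], ["u2", "a2"], ["u3", "a3"]]
turns_windows = [
    list(itertools.chain(*window))
    for window in sliding_windows(unit_interactions, size=2)
]
print(turns_windows)
# [['u1', 'a1'], ['u1', 'a1', 'u2', 'a2'], ['u2', 'a2', 'u3', 'a3']]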
deepeval/metrics/turn_contextual_recall/schema.py

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 from pydantic import BaseModel


@@ -17,5 +17,5 @@ class ContextualRecallScoreReason(BaseModel):

 class InteractionContextualRecallScore(BaseModel):
     score: float
-    reason: str
-    verdicts: List[ContextualRecallVerdict]
+    reason: Optional[str]
+    verdicts: Optional[List[ContextualRecallVerdict]]
deepeval/metrics/turn_contextual_recall/template.py

@@ -125,6 +125,13 @@ class TurnContextualRecallTemplate:
     Context:
     This metric evaluates conversational contextual recall by determining whether sentences in the assistant output can be attributed to the retrieval context for each interaction. Each interaction yields a reason indicating which sentences were supported or unsupported. You are given all those reasons.

+    **
+    IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+    Example JSON:
+    {{
+        "reason": "The score is <contextual_recall_score> because <your_reason>."
+    }}
+
     Inputs:
     - final_score: the averaged score across all interactions.
     - success: whether the metric passed or failed
@@ -151,7 +158,7 @@ class TurnContextualRecallTemplate:

     Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.

-    The final reason:
+    JSON:
     """
         )
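
For context, a hedged sketch of how the new window_size knob on TurnContextualPrecisionMetric might be used. It assumes the metric is re-exported from deepeval.metrics like the package's other metrics and that a judge model (e.g. an OpenAI key) is already configured; the Turn and ConversationalTestCase fields follow the attributes referenced in the diff above.

from deepeval.metrics import TurnContextualPrecisionMetric  # assumed re-export path
from deepeval.test_case import ConversationalTestCase, Turn

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(
            role="assistant",
            content="Your order shipped yesterday.",
            # retrieval_context on assistant turns is what the metric now
            # aggregates per sliding window before generating verdicts.
            retrieval_context=["Order #123 shipped on 2024-05-01."],
        ),
    ],
    expected_outcome="The assistant reports the order's shipping status.",
)

# window_size (new in 3.7.7, default 10) controls how many unit interactions
# are grouped into each scoring window.
metric = TurnContextualPrecisionMetric(window_size=5, include_reason=True)
metric.measure(test_case)
print(metric.score, metric.reason)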