deepeval 3.7.6__py3-none-any.whl → 3.7.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +658 -262
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/test_run_tracer.py +4 -6
  8. deepeval/evaluate/execute.py +153 -94
  9. deepeval/integrations/pydantic_ai/instrumentator.py +4 -2
  10. deepeval/integrations/pydantic_ai/otel.py +5 -1
  11. deepeval/key_handler.py +121 -51
  12. deepeval/metrics/base_metric.py +9 -3
  13. deepeval/metrics/g_eval/g_eval.py +6 -1
  14. deepeval/metrics/indicator.py +8 -4
  15. deepeval/metrics/mcp/mcp_task_completion.py +15 -16
  16. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +15 -15
  17. deepeval/metrics/mcp/schema.py +4 -0
  18. deepeval/metrics/mcp/template.py +8 -1
  19. deepeval/metrics/prompt_alignment/prompt_alignment.py +6 -3
  20. deepeval/metrics/tool_use/schema.py +4 -0
  21. deepeval/metrics/tool_use/template.py +16 -2
  22. deepeval/metrics/tool_use/tool_use.py +30 -28
  23. deepeval/metrics/topic_adherence/schema.py +4 -0
  24. deepeval/metrics/topic_adherence/template.py +8 -1
  25. deepeval/metrics/topic_adherence/topic_adherence.py +15 -14
  26. deepeval/metrics/turn_contextual_precision/template.py +8 -1
  27. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +44 -86
  28. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  29. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +44 -82
  30. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  31. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +48 -92
  32. deepeval/metrics/turn_faithfulness/template.py +8 -1
  33. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +76 -130
  34. deepeval/metrics/utils.py +16 -1
  35. deepeval/models/__init__.py +2 -0
  36. deepeval/models/llms/__init__.py +2 -0
  37. deepeval/models/llms/amazon_bedrock_model.py +5 -4
  38. deepeval/models/llms/anthropic_model.py +4 -3
  39. deepeval/models/llms/azure_model.py +4 -3
  40. deepeval/models/llms/deepseek_model.py +5 -8
  41. deepeval/models/llms/grok_model.py +5 -8
  42. deepeval/models/llms/kimi_model.py +5 -8
  43. deepeval/models/llms/litellm_model.py +2 -0
  44. deepeval/models/llms/local_model.py +1 -1
  45. deepeval/models/llms/openai_model.py +4 -3
  46. deepeval/models/retry_policy.py +10 -5
  47. deepeval/models/utils.py +1 -5
  48. deepeval/simulator/conversation_simulator.py +6 -2
  49. deepeval/simulator/template.py +3 -1
  50. deepeval/synthesizer/synthesizer.py +19 -17
  51. deepeval/test_run/test_run.py +6 -1
  52. deepeval/utils.py +26 -0
  53. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/METADATA +3 -3
  54. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/RECORD +57 -56
  55. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/LICENSE.md +0 -0
  56. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/WHEEL +0 -0
  57. {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/topic_adherence/topic_adherence.py

@@ -18,6 +18,7 @@ from deepeval.metrics.topic_adherence.schema import (
     RelevancyVerdict,
     QAPairs,
     QAPair,
+    TopicAdherenceReason,
 )
 from deepeval.metrics.api import metric_data_manager
 
@@ -227,25 +228,25 @@ class TopicAdherenceMetric(BaseConversationalMetric):
         prompt = TopicAdherenceTemplate.generate_reason(
             self.success, self.score, self.threshold, TP, TN, FP, FN
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TopicAdherenceReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(self, TP, TN, FP, FN):
         prompt = TopicAdherenceTemplate.generate_reason(
             self.success, self.score, self.threshold, TP, TN, FP, FN
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TopicAdherenceReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_score(self, TP, TN, FP, FN) -> float:
         true_values = TP[0] + TN[0]
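Note: the generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers that this hunk (and the turn-contextual metrics below) switch to live in deepeval/metrics/utils.py (file 34 in the list above); their implementation is not part of this diff. As a rough, hypothetical sketch only — inferred from the branching they replace (native models return a (result, cost) tuple, custom models may raise TypeError when passed schema, and trimAndLoadJson parses raw JSON output) — the synchronous helper plausibly behaves like the code below, with the async variant awaiting metric.model.a_generate in the same shape:

    # Hypothetical sketch only; not the actual deepeval 3.7.8 implementation.
    from typing import Any, Callable, Type

    from pydantic import BaseModel

    from deepeval.metrics.utils import trimAndLoadJson  # JSON-trimming helper used by the removed code


    def generate_with_schema_and_extract(
        metric,
        prompt: str,
        schema_cls: Type[BaseModel],
        extract_schema: Callable[[Any], Any],
        extract_json: Callable[[dict], Any],
    ) -> Any:
        if metric.using_native_model:
            # Native deepeval models return (result, cost); keep accumulating evaluation cost.
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            # Custom models that accept `schema` return an already-parsed pydantic object.
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Custom models without schema support: fall back to parsing the raw JSON text.
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)

The extract_schema / extract_json callables seen in each hunk then pick the relevant field (a reason string or a list of verdicts) out of either result shape.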
deepeval/metrics/turn_contextual_precision/template.py

@@ -134,6 +134,13 @@ class TurnContextualPrecisionTemplate:
             Context:
             This metric evaluates conversational contextual precision by determining whether relevant nodes in retrieval context are ranked higher than irrelevant nodes for each interaction. Each interaction yields a reason indicating why relevant nodes were well-ranked or poorly-ranked. You are given all those reasons.
 
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            Example JSON:
+            {{
+                "reason": "The score is <contextual_precision_score> because <your_reason>."
+            }}
+
             Inputs:
             - final_score: the averaged score across all interactions.
             - success: whether the metric passed or failed
@@ -160,7 +167,7 @@ class TurnContextualPrecisionTemplate:
 
             Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-            The final reason:
+            JSON:
             """
         )
 
deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py

@@ -14,6 +14,8 @@ from deepeval.metrics.utils import (
     get_unit_interactions,
     get_turns_in_sliding_window,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_precision.template import (
@@ -279,26 +281,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualPrecisionVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self,
@@ -319,24 +308,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualPrecisionVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     async def _a_get_interaction_score_and_reason(
         self,
@@ -438,24 +416,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualPrecisionScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualPrecisionScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=ContextualPrecisionScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(
         self,
@@ -485,22 +452,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualPrecisionScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualPrecisionScoreReason = self.model.generate(
-                    prompt, schema=ContextualPrecisionScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionContextualPrecisionScore]
@@ -533,13 +491,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
@@ -558,13 +516,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionContextualPrecisionScore]
deepeval/metrics/turn_contextual_recall/template.py

@@ -125,6 +125,13 @@ class TurnContextualRecallTemplate:
             Context:
             This metric evaluates conversational contextual recall by determining whether sentences in the assistant output can be attributed to the retrieval context for each interaction. Each interaction yields a reason indicating which sentences were supported or unsupported. You are given all those reasons.
 
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            Example JSON:
+            {{
+                "reason": "The score is <contextual_recall_score> because <your_reason>."
+            }}
+
             Inputs:
             - final_score: the averaged score across all interactions.
             - success: whether the metric passed or failed
@@ -151,7 +158,7 @@ class TurnContextualRecallTemplate:
 
             Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-            The final reason:
+            JSON:
             """
         )
 
deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py

@@ -14,6 +14,8 @@ from deepeval.metrics.utils import (
     get_unit_interactions,
     get_turns_in_sliding_window,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_recall.template import (
@@ -271,25 +273,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualRecallVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self,
@@ -308,23 +298,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    ContextualRecallVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     async def _a_get_interaction_score_and_reason(
         self,
@@ -412,22 +392,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualRecallScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRecallScoreReason = await self.model.a_generate(
-                    prompt, schema=ContextualRecallScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(
         self,
@@ -456,22 +427,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualRecallScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRecallScoreReason = self.model.generate(
-                    prompt, schema=ContextualRecallScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionContextualRecallScore]
@@ -504,13 +466,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionContextualRecallScore]
@@ -529,13 +491,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionContextualRecallScore]
deepeval/metrics/turn_contextual_relevancy/template.py

@@ -130,6 +130,13 @@ class TurnContextualRelevancyTemplate:
             Context:
             This metric evaluates conversational contextual relevancy by determining whether statements in the retrieval context are relevant to the user message for each interaction. Each interaction yields a reason indicating which statements were relevant or irrelevant. You are given all those reasons.
 
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+            Example JSON:
+            {{
+                "reason": "The score is <contextual_relevancy_score> because <your_reason>."
+            }}
+
             Inputs:
             - final_score: the averaged score across all interactions.
             - success: whether the metric passed or failed
@@ -156,6 +163,6 @@ class TurnContextualRelevancyTemplate:
 
             Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-            The final reason:
+            JSON:
             """
         )
deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py

@@ -14,6 +14,8 @@ from deepeval.metrics.utils import (
     get_unit_interactions,
     get_turns_in_sliding_window,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_relevancy.template import (
@@ -264,29 +266,15 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
                 multimodal=multimodal,
             )
 
-            if self.using_native_model:
-                res, cost = await self.model.a_generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-                self.evaluation_cost += cost
-                verdicts.extend([item for item in res.verdicts])
-            else:
-                try:
-                    res: ContextualRelevancyVerdicts = (
-                        await self.model.a_generate(
-                            prompt, schema=ContextualRelevancyVerdicts
-                        )
-                    )
-                    verdicts.extend([item for item in res.verdicts])
-                except TypeError:
-                    res = await self.model.a_generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    verdicts.extend(
-                        [
-                            ContextualRelevancyVerdict(**item)
-                            for item in data["verdicts"]
-                        ]
-                    )
+            result = await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=ContextualRelevancyVerdicts,
+                extract_schema=lambda s: s.verdicts,
+                extract_json=lambda data: data["verdicts"],
+            )
+
+            verdicts.extend(result)
 
         return verdicts
 
@@ -306,27 +294,15 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
                 multimodal=multimodal,
             )
 
-            if self.using_native_model:
-                res, cost = self.model.generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-                self.evaluation_cost += cost
-                verdicts.extend([item for item in res.verdicts])
-            else:
-                try:
-                    res: ContextualRelevancyVerdicts = self.model.generate(
-                        prompt, schema=ContextualRelevancyVerdicts
-                    )
-                    verdicts.extend([item for item in res.verdicts])
-                except TypeError:
-                    res = self.model.generate(prompt)
-                    data = trimAndLoadJson(res, self)
-                    verdicts.extend(
-                        [
-                            ContextualRelevancyVerdict(**item)
-                            for item in data["verdicts"]
-                        ]
-                    )
+            result = generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=ContextualRelevancyVerdicts,
+                extract_schema=lambda s: s.verdicts,
+                extract_json=lambda data: data["verdicts"],
+            )
+
+            verdicts.extend(result)
 
         return verdicts
 
@@ -419,24 +395,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ContextualRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRelevancyScoreReason = (
-                    await self.model.a_generate(
-                        prompt, schema=ContextualRelevancyScoreReason
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(
         self,
@@ -469,22 +434,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ContextualRelevancyScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: ContextualRelevancyScoreReason = self.model.generate(
-                    prompt, schema=ContextualRelevancyScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, windows_scores: List[InteractionContextualRelevancyScore]
@@ -517,13 +473,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
@@ -542,13 +498,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionContextualRelevancyScore]