deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +106 -21
  5. deepeval/evaluate/evaluate.py +0 -3
  6. deepeval/evaluate/execute.py +10 -222
  7. deepeval/evaluate/utils.py +6 -30
  8. deepeval/key_handler.py +3 -0
  9. deepeval/metrics/__init__.py +0 -4
  10. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  11. deepeval/metrics/answer_relevancy/template.py +102 -179
  12. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  13. deepeval/metrics/arena_g_eval/template.py +17 -1
  14. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  15. deepeval/metrics/argument_correctness/template.py +19 -2
  16. deepeval/metrics/base_metric.py +13 -41
  17. deepeval/metrics/bias/bias.py +102 -108
  18. deepeval/metrics/bias/template.py +14 -2
  19. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  20. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  21. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  22. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  23. deepeval/metrics/conversation_completeness/template.py +23 -3
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  25. deepeval/metrics/conversational_dag/nodes.py +66 -123
  26. deepeval/metrics/conversational_dag/templates.py +16 -0
  27. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  28. deepeval/metrics/dag/dag.py +10 -0
  29. deepeval/metrics/dag/nodes.py +63 -126
  30. deepeval/metrics/dag/templates.py +14 -0
  31. deepeval/metrics/exact_match/exact_match.py +9 -1
  32. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  33. deepeval/metrics/g_eval/g_eval.py +87 -78
  34. deepeval/metrics/g_eval/template.py +18 -1
  35. deepeval/metrics/g_eval/utils.py +7 -6
  36. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  37. deepeval/metrics/goal_accuracy/template.py +21 -3
  38. deepeval/metrics/hallucination/hallucination.py +60 -75
  39. deepeval/metrics/hallucination/template.py +13 -0
  40. deepeval/metrics/indicator.py +3 -6
  41. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  42. deepeval/metrics/json_correctness/template.py +10 -0
  43. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  44. deepeval/metrics/knowledge_retention/schema.py +9 -3
  45. deepeval/metrics/knowledge_retention/template.py +12 -0
  46. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  47. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  48. deepeval/metrics/mcp/template.py +52 -0
  49. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  50. deepeval/metrics/mcp_use_metric/template.py +12 -0
  51. deepeval/metrics/misuse/misuse.py +77 -97
  52. deepeval/metrics/misuse/template.py +15 -0
  53. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  58. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  59. deepeval/metrics/non_advice/non_advice.py +79 -105
  60. deepeval/metrics/non_advice/template.py +12 -0
  61. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  62. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  63. deepeval/metrics/pii_leakage/template.py +14 -0
  64. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  65. deepeval/metrics/plan_adherence/template.py +11 -0
  66. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  67. deepeval/metrics/plan_quality/template.py +9 -0
  68. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  69. deepeval/metrics/prompt_alignment/template.py +12 -0
  70. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  71. deepeval/metrics/role_adherence/template.py +14 -0
  72. deepeval/metrics/role_violation/role_violation.py +75 -108
  73. deepeval/metrics/role_violation/template.py +12 -0
  74. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  75. deepeval/metrics/step_efficiency/template.py +11 -0
  76. deepeval/metrics/summarization/summarization.py +115 -183
  77. deepeval/metrics/summarization/template.py +19 -0
  78. deepeval/metrics/task_completion/task_completion.py +67 -73
  79. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  80. deepeval/metrics/tool_use/tool_use.py +42 -66
  81. deepeval/metrics/topic_adherence/template.py +13 -0
  82. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  83. deepeval/metrics/toxicity/template.py +13 -0
  84. deepeval/metrics/toxicity/toxicity.py +80 -99
  85. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  86. deepeval/metrics/turn_contextual_precision/template.py +1 -1
  87. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
  88. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  89. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
  90. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  91. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
  92. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  93. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
  94. deepeval/metrics/turn_relevancy/template.py +14 -0
  95. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  96. deepeval/metrics/utils.py +145 -90
  97. deepeval/models/base_model.py +44 -6
  98. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  99. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  100. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  101. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  102. deepeval/models/llms/amazon_bedrock_model.py +226 -71
  103. deepeval/models/llms/anthropic_model.py +141 -47
  104. deepeval/models/llms/azure_model.py +167 -94
  105. deepeval/models/llms/constants.py +2032 -0
  106. deepeval/models/llms/deepseek_model.py +79 -29
  107. deepeval/models/llms/gemini_model.py +126 -67
  108. deepeval/models/llms/grok_model.py +125 -59
  109. deepeval/models/llms/kimi_model.py +126 -81
  110. deepeval/models/llms/litellm_model.py +92 -18
  111. deepeval/models/llms/local_model.py +114 -15
  112. deepeval/models/llms/ollama_model.py +97 -76
  113. deepeval/models/llms/openai_model.py +167 -310
  114. deepeval/models/llms/portkey_model.py +58 -16
  115. deepeval/models/llms/utils.py +5 -2
  116. deepeval/models/utils.py +60 -4
  117. deepeval/simulator/conversation_simulator.py +43 -0
  118. deepeval/simulator/template.py +13 -0
  119. deepeval/test_case/api.py +24 -45
  120. deepeval/test_case/arena_test_case.py +7 -2
  121. deepeval/test_case/conversational_test_case.py +55 -6
  122. deepeval/test_case/llm_test_case.py +60 -6
  123. deepeval/test_run/api.py +3 -0
  124. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
  125. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
  126. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  127. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  128. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  129. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  130. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  131. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  132. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  133. {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
@@ -18,7 +18,11 @@ from deepeval.metrics.dag.templates import (
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.metrics.g_eval.g_eval import GEval
 from deepeval.metrics.g_eval.utils import G_EVAL_PARAMS
-from deepeval.metrics.utils import copy_metrics, trimAndLoadJson
+from deepeval.metrics.utils import (
+    copy_metrics,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
+)
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
 from deepeval.utils import prettify_list
 
@@ -222,20 +226,13 @@ class VerdictNode(BaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=MetricScoreReason)
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = metric.model.generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)
-
-        return res.reason
+        return generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(self, metric: BaseMetric):
         prompt = VerdictNodeTemplate.generate_reason(
@@ -243,22 +240,13 @@ class VerdictNode(BaseNode):
             score=metric.score,
             name=metric.__name__,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=MetricScoreReason
-            )
-            metric.evaluation_cost += cost
-        else:
-            try:
-                res: MetricScoreReason = await metric.model.a_generate(
-                    prompt, schema=MetricScoreReason
-                )
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = MetricScoreReason(**data)
-
-        return res.reason
+        return await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=MetricScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
 
 @dataclass
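
The hunks above, and the near-identical ones that follow, all collapse the same boilerplate: branch on `metric.using_native_model`, call the model with a Pydantic schema, track evaluation cost, and fall back to raw JSON parsing for custom models that reject the `schema` kwarg. The diff never shows the new shared helper itself, so the sketch below is only a reconstruction inferred from the code it replaces; the actual implementation in `deepeval/metrics/utils.py` may differ.

```python
# Sketch of generate_with_schema_and_extract, reconstructed from the inline
# logic removed in this diff; not the verbatim deepeval implementation.
from typing import Any, Callable, Type

from pydantic import BaseModel

# trimAndLoadJson is deepeval's existing JSON-repair helper; assumed to still
# live in deepeval.metrics.utils as it did in 3.7.5.
from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native judge models return (parsed_schema, cost); the cost is accumulated.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without schema support: generate raw text, repair and
        # parse the JSON, then build the result from the plain dict.
        raw = metric.model.generate(prompt)
        data = trimAndLoadJson(raw, metric)
        return extract_json(data)
```

`a_generate_with_schema_and_extract` would mirror this with `await metric.model.a_generate(...)`. The two extractor callbacks cover both shapes seen in the refactor: field access on the validated schema object (`lambda s: s.reason`) versus pulling the key from, or rebuilding the object out of, the raw dict (`lambda data: data["reason"]`, `lambda data: BinaryJudgementVerdict(**data)`); nodes that need the whole verdict simply pass `lambda s: s`.
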
@@ -317,20 +305,13 @@ class TaskNode(BaseNode):
             instructions=self.instructions,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(prompt, schema=TaskNodeOutput)
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = metric.model.generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output
+        self._output = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -371,22 +352,13 @@ class TaskNode(BaseNode):
             text=text,
         )
 
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=TaskNodeOutput
-            )
-            metric.evaluation_cost += cost
-            self._output = res.output
-        else:
-            try:
-                res: TaskNodeOutput = await metric.model.a_generate(
-                    prompt, schema=TaskNodeOutput
-                )
-                self._output = res.output
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._output = TaskNodeOutput(**data).output
+        self._output = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=TaskNodeOutput,
+            extract_schema=lambda s: s.output,
+            extract_json=lambda data: data["output"],
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -470,23 +442,13 @@ class BinaryJudgementNode(BaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = metric.model.generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
-
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
         )
@@ -520,22 +482,13 @@ class BinaryJudgementNode(BaseNode):
             criteria=self.criteria,
             text=text,
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=BinaryJudgementVerdict
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: BinaryJudgementVerdict = await metric.model.a_generate(
-                    prompt, schema=BinaryJudgementVerdict
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = BinaryJudgementVerdict(**data)
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=BinaryJudgementVerdict,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: BinaryJudgementVerdict(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -629,22 +582,14 @@ class NonBinaryJudgementNode(BaseNode):
         prompt = NonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = metric.model.generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = metric.model.generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = metric.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -678,22 +623,14 @@ class NonBinaryJudgementNode(BaseNode):
         prompt = NonBinaryJudgementTemplate.generate_non_binary_verdict(
             criteria=self.criteria, text=text, options=self._verdict_options
         )
-        if metric.using_native_model:
-            res, cost = await metric.model.a_generate(
-                prompt, schema=self._verdict_schema
-            )
-            metric.evaluation_cost += cost
-            self._verdict = res
-        else:
-            try:
-                res: self._verdict_schema = await metric.model.a_generate(
-                    prompt, schema=self._verdict_schema
-                )
-                self._verdict = res
-            except TypeError:
-                res = await metric.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                self._verdict = self._verdict_schema(**data)
+
+        self._verdict = await a_generate_with_schema_and_extract(
+            metric=metric,
+            prompt=prompt,
+            schema_cls=self._verdict_schema,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: self._verdict_schema(**data),
+        )
 
         metric._verbose_steps.append(
             construct_node_verbose_log(self, self._depth)
@@ -1,5 +1,13 @@
 from typing import List
 
+multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
 
 class VerdictNodeTemplate:
     @staticmethod
@@ -34,6 +42,8 @@ class TaskNodeTemplate:
     def generate_task_output(instructions: str, text: str):
         return f"""Given the following instructions, generate an output.
 
+{multimodal_rules}
+
 {instructions}
 
 {text}
@@ -57,6 +67,8 @@ class BinaryJudgementTemplate:
     def generate_binary_verdict(criteria: str, text: str):
         return f"""{criteria}
 
+{multimodal_rules}
+
 {text}
 
 **
@@ -79,6 +91,8 @@ class NonBinaryJudgementTemplate:
     ):
         return f"""{criteria}
 
+{multimodal_rules}
+
 {text}
 
 **
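
Because `multimodal_rules` is interpolated straight into each template's f-string, the block lands verbatim in every DAG prompt. A standalone sketch of the rendered text follows; the constant is copied from the hunk above, and only the prompt prefix visible in this diff is reproduced (the real templates continue past `{text}`).

```python
# Standalone illustration of how the new module-level constant is spliced into
# the DAG prompt templates; not an import from deepeval.
multimodal_rules = """
--- MULTIMODAL INPUT RULES ---
- Treat image content as factual evidence.
- Only reference visual details that are explicitly and clearly visible.
- Do not infer or guess objects, text, or details not visibly present.
- If an image is unclear or ambiguous, mark uncertainty explicitly.
"""


def generate_task_output(instructions: str, text: str) -> str:
    # Mirrors the start of TaskNodeTemplate.generate_task_output after this change.
    return f"""Given the following instructions, generate an output.

{multimodal_rules}

{instructions}

{text}"""


print(generate_task_output("Summarize the attached chart.", "<retrieved context>"))
```
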
@@ -32,7 +32,15 @@ class ExactMatchMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            None,
+            test_case.multimodal,
+        )
 
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
@@ -1,7 +1,7 @@
 from typing import List, Optional, Union, Type
 import asyncio
 
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.metrics import BaseMetric
 from deepeval.utils import (
     get_or_create_event_loop,
@@ -9,10 +9,10 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
-    check_mllm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.faithfulness.template import FaithfulnessTemplate
@@ -69,12 +69,15 @@ class FaithfulnessMetric(BaseMetric):
     ) -> float:
 
         multimodal = test_case.multimodal
-        if multimodal:
-            check_mllm_test_case_params(
-                test_case, self._required_params, None, None, self, self.model
-            )
-        else:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -127,12 +130,15 @@
     ) -> float:
 
         multimodal = test_case.multimodal
-        if multimodal:
-            check_mllm_test_case_params(
-                test_case, self._required_params, None, None, self, self.model
-            )
-        else:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
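
Both FaithfulnessMetric and ExactMatchMetric now call `check_llm_test_case_params` with seven positional arguments instead of three, folding the old `check_mllm_test_case_params` branch into a single checker gated by the trailing `multimodal` flag. The diff never shows the helper's definition, so the signature below is only inferred from its call sites; every parameter name other than `test_case`, `metric`, `model`, and `multimodal` is a guess.

```python
# Hypothetical signature for the unified checker in deepeval/metrics/utils.py,
# inferred from the call sites in this diff. The two None slots and their names
# (image-count limits, by analogy with the old MLLM checker) are guesses.
from typing import List, Optional

from deepeval.test_case import LLMTestCase, LLMTestCaseParams


def check_llm_test_case_params(
    test_case: LLMTestCase,
    test_case_params: List[LLMTestCaseParams],
    input_image_count: Optional[int],          # guessed name; passed as None above
    actual_output_image_count: Optional[int],  # guessed name; passed as None above
    metric,                                    # the calling metric, for error context
    model=None,                                # judge model, only used when multimodal
    multimodal: bool = False,                  # routes to the old MLLM-style checks
) -> None:
    """Validate that the test case carries the params the metric requires."""
    ...
```
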
@@ -182,22 +188,13 @@ class FaithfulnessMetric(BaseMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = await self.model.a_generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
@@ -214,22 +211,13 @@ class FaithfulnessMetric(BaseMetric):
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = self.model.generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, multimodal: bool
@@ -237,63 +225,41 @@ class FaithfulnessMetric(BaseMetric):
         if len(self.claims) == 0:
             return []
 
-        verdicts: List[FaithfulnessVerdict] = []
-
         prompt = self.evaluation_template.generate_verdicts(
             claims=self.claims,
             retrieval_context="\n\n".join(self.truths),
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                FaithfulnessVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(self, multimodal: bool) -> List[FaithfulnessVerdict]:
         if len(self.claims) == 0:
             return []
 
-        verdicts: List[FaithfulnessVerdict] = []
-
         prompt = self.evaluation_template.generate_verdicts(
             claims=self.claims,
             retrieval_context="\n\n".join(self.truths),
             multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                FaithfulnessVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     async def _a_generate_truths(
         self, retrieval_context: str, multimodal: bool
@@ -303,18 +269,13 @@ class FaithfulnessMetric(BaseMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = await self.model.a_generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     def _generate_truths(
         self, retrieval_context: str, multimodal: bool
@@ -324,18 +285,13 @@ class FaithfulnessMetric(BaseMetric):
             extraction_limit=self.truths_extraction_limit,
             multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = self.model.generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     async def _a_generate_claims(
         self, actual_output: str, multimodal: bool
@@ -343,18 +299,13 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_claims(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def _generate_claims(
         self, actual_output: str, multimodal: bool
@@ -362,18 +313,13 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_claims(
             actual_output=actual_output, multimodal=multimodal
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -400,7 +346,7 @@
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
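
The final hunk narrows a bare `except:` around the success check to `except TypeError`, the error actually raised when `self.score` is still `None` (for example after a failed evaluation) and gets compared against the float threshold. A minimal repro of the behavior being guarded:

```python
# None >= float raises TypeError, which is what the narrowed handler catches;
# the old bare except: would also have swallowed unrelated errors.
score = None
threshold = 0.5

try:
    success = score >= threshold
except TypeError:
    success = False

print(success)  # False
```
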