deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
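
Most of the churn in the metric files above is one refactor, visible in the faithfulness diff below: the duplicated using_native_model / try-except-TypeError branching around structured LLM calls is replaced by shared generate_with_schema_and_extract and a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. The wheel diff shows only the call sites, so the sketch below is a plausible reconstruction of the sync helper from the branches it removes; the keyword signature (metric, prompt, schema_cls, extract_schema, extract_json) is confirmed by the call sites, but the body is an inference, not the shipped code.

    # Hypothetical reconstruction of generate_with_schema_and_extract
    # (deepeval/metrics/utils.py), inferred from the removed branching below.
    from typing import Callable, Type, TypeVar

    from pydantic import BaseModel

    from deepeval.metrics.utils import trimAndLoadJson  # pre-existing util

    SchemaT = TypeVar("SchemaT", bound=BaseModel)
    ResultT = TypeVar("ResultT")


    def generate_with_schema_and_extract(
        metric,
        prompt: str,
        schema_cls: Type[SchemaT],
        extract_schema: Callable[[SchemaT], ResultT],
        extract_json: Callable[[dict], ResultT],
    ) -> ResultT:
        if metric.using_native_model:
            # Native models return (result, cost); track evaluation cost.
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            # Custom models may support structured output directly.
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Fallback: raw generation, then trim and parse the JSON manually.
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)

The async twin presumably mirrors this with await metric.model.a_generate(...). Centralizing the fallback explains the large minus counts on nearly every metric file and the removal of trimAndLoadJson imports from the individual metrics.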
deepeval/metrics/faithfulness/faithfulness.py
@@ -1,17 +1,18 @@
 from typing import List, Optional, Union, Type
 import asyncio
 
-from deepeval.test_case import (
-    LLMTestCase,
-    LLMTestCaseParams,
-)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.metrics import BaseMetric
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.faithfulness.template import FaithfulnessTemplate
@@ -67,7 +68,16 @@ class FaithfulnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -84,11 +94,16 @@ class FaithfulnessMetric(BaseMetric):
                 )
             )
         else:
-            self.truths = self._generate_truths(test_case.retrieval_context)
-            self.claims = self._generate_claims(test_case.actual_output)
-            self.verdicts = self._generate_verdicts()
+            retrieval_context = test_case.retrieval_context
+            actual_output = test_case.actual_output
+
+            self.truths = self._generate_truths(
+                retrieval_context, multimodal
+            )
+            self.claims = self._generate_claims(actual_output, multimodal)
+            self.verdicts = self._generate_verdicts(multimodal)
             self.score = self._calculate_score()
-            self.reason = self._generate_reason()
+            self.reason = self._generate_reason(multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -114,7 +129,16 @@ class FaithfulnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -123,13 +147,16 @@ class FaithfulnessMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            retrieval_context = test_case.retrieval_context
+            actual_output = test_case.actual_output
+
             self.truths, self.claims = await asyncio.gather(
-                self._a_generate_truths(test_case.retrieval_context),
-                self._a_generate_claims(test_case.actual_output),
+                self._a_generate_truths(retrieval_context, multimodal),
+                self._a_generate_claims(actual_output, multimodal),
             )
-            self.verdicts = await self._a_generate_verdicts()
+            self.verdicts = await self._a_generate_verdicts(multimodal)
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason()
+            self.reason = await self._a_generate_reason(multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -146,7 +173,7 @@ class FaithfulnessMetric(BaseMetric):
         )
         return self.score
 
-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -158,26 +185,18 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_reason(
             contradictions=contradictions,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = await self.model.a_generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    def _generate_reason(self) -> str:
+    def _generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
             return None
 
@@ -189,148 +208,118 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_reason(
             contradictions=contradictions,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=FaithfulnessScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: FaithfulnessScoreReason = self.model.generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]:
+    async def _a_generate_verdicts(
+        self, multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
         if len(self.claims) == 0:
             return []
 
-        verdicts: List[FaithfulnessVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
-            claims=self.claims, retrieval_context="\n\n".join(self.truths)
+            claims=self.claims,
+            retrieval_context="\n\n".join(self.truths),
+            multimodal=multimodal,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _generate_verdicts(self) -> List[FaithfulnessVerdict]:
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                FaithfulnessVerdict(**item) for item in data["verdicts"]
+            ],
+        )
+
+    def _generate_verdicts(self, multimodal: bool) -> List[FaithfulnessVerdict]:
         if len(self.claims) == 0:
             return []
 
-        verdicts: List[FaithfulnessVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
-            claims=self.claims, retrieval_context="\n\n".join(self.truths)
+            claims=self.claims,
+            retrieval_context="\n\n".join(self.truths),
+            multimodal=multimodal,
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    async def _a_generate_truths(self, retrieval_context: str) -> List[str]:
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                FaithfulnessVerdict(**item) for item in data["verdicts"]
+            ],
+        )
+
+    async def _a_generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_truths(
             retrieval_context="\n\n".join(retrieval_context),
             extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = await self.model.a_generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
 
-    def _generate_truths(self, retrieval_context: str) -> List[str]:
+    def _generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_truths(
             retrieval_context="\n\n".join(retrieval_context),
             extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = self.model.generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
 
-    async def _a_generate_claims(self, actual_output: str) -> List[str]:
+    async def _a_generate_claims(
+        self, actual_output: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_claims(
-            actual_output=actual_output
+            actual_output=actual_output, multimodal=multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
 
-    def _generate_claims(self, actual_output: str) -> List[str]:
+    def _generate_claims(
+        self, actual_output: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_claims(
-            actual_output=actual_output
+            actual_output=actual_output, multimodal=multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -357,7 +346,7 @@ class FaithfulnessMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
deepeval/metrics/faithfulness/schema.py
@@ -3,7 +3,7 @@ from pydantic import BaseModel, Field
 
 
 class FaithfulnessVerdict(BaseModel):
-    verdict: Literal["yes", "idk", "no"]
+    verdict: Literal["yes", "no", "idk"]
     reason: Optional[str] = Field(default=None)
 
 
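
The schema change above only reorders the Literal values. Assuming Pydantic v2, that reorders the enum entries in the generated JSON schema (what schema-aware structured-output calls receive); validation accepts the same three strings either way:

    from typing import Literal, Optional

    from pydantic import BaseModel, Field


    class FaithfulnessVerdict(BaseModel):
        verdict: Literal["yes", "no", "idk"]
        reason: Optional[str] = Field(default=None)


    # Enum order in the emitted schema follows the Literal order, e.g.
    # {'enum': ['yes', 'no', 'idk'], 'title': 'Verdict', 'type': 'string'}
    print(FaithfulnessVerdict.model_json_schema()["properties"]["verdict"])
    FaithfulnessVerdict(verdict="idk")  # any of the three still validates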