deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/summarization/summarization.py

@@ -5,16 +5,16 @@ from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
-    ConversationalTestCase,
 )
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.summarization.template import SummarizationTemplate
 from deepeval.metrics.faithfulness.template import FaithfulnessTemplate
@@ -77,7 +77,15 @@ class SummarizationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -138,7 +146,15 @@ class SummarizationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -185,7 +201,7 @@ class SummarizationMetric(BaseMetric):
 
         return self.score
 
-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -221,24 +237,15 @@ class SummarizationMetric(BaseMetric):
         prompt += """JSON:
 """
 
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=SummarizationScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: SummarizationScoreReason = await self.model.a_generate(
-                    prompt, schema=SummarizationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=SummarizationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    def _generate_reason(self) -> str:
+    def _generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -274,22 +281,13 @@ class SummarizationMetric(BaseMetric):
         prompt += """JSON:
 """
 
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=SummarizationScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: SummarizationScoreReason = self.model.generate(
-                    prompt, schema=SummarizationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=SummarizationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(self, score_type: ScoreType) -> float:
         if score_type == ScoreType.ALIGNMENT:
@@ -327,69 +325,45 @@ class SummarizationMetric(BaseMetric):
         prompt = SummarizationTemplate.generate_answers(
             questions=self.assessment_questions, text=text
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Answers)
-            self.evaluation_cost += cost
-            return res.answers
-        else:
-            try:
-                res: Answers = await self.model.a_generate(
-                    prompt, schema=Answers
-                )
-                return res.answers
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["answers"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Answers,
+            extract_schema=lambda s: s.answers,
+            extract_json=lambda data: data["answers"],
+        )
 
     def _generate_answers(self, text: str) -> List[str]:
         prompt = SummarizationTemplate.generate_answers(
             questions=self.assessment_questions, text=text
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Answers)
-            self.evaluation_cost += cost
-            return res.answers
-        else:
-            try:
-                res: Answers = self.model.generate(prompt, schema=Answers)
-                return res.answers
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["answers"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Answers,
+            extract_schema=lambda s: s.answers,
+            extract_json=lambda data: data["answers"],
+        )
 
-    async def _a_generate_assessment_questions(self, text: str):
+    async def _a_generate_assessment_questions(self, text: str) -> List[str]:
         prompt = SummarizationTemplate.generate_questions(text=text, n=self.n)
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Questions)
-            self.evaluation_cost += cost
-            return res.questions
-        else:
-            try:
-                res: Questions = await self.model.a_generate(
-                    prompt, schema=Questions
-                )
-                return res.questions
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["questions"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Questions,
+            extract_schema=lambda s: s.questions,
+            extract_json=lambda data: data["questions"],
+        )
 
-    def _generate_assessment_questions(self, text: str):
+    def _generate_assessment_questions(self, text: str) -> List[str]:
         prompt = SummarizationTemplate.generate_questions(text=text, n=self.n)
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Questions)
-            self.evaluation_cost += cost
-            return res.questions
-        else:
-            try:
-                res: Questions = self.model.generate(prompt, schema=Questions)
-                return res.questions
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["questions"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Questions,
+            extract_schema=lambda s: s.questions,
+            extract_json=lambda data: data["questions"],
+        )
 
     async def _a_generate_coverage_verdicts(
         self, test_case: LLMTestCase
@@ -453,30 +427,19 @@ class SummarizationMetric(BaseMetric):
         if len(self.claims) == 0:
             return []
 
-        verdicts: List[SummarizationAlignmentVerdict] = []
         prompt = SummarizationTemplate.generate_alignment_verdicts(
             summary_claims=self.claims, original_text="\n\n".join(self.truths)
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Verdicts)
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.a_generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    SummarizationAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                SummarizationAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )
 
     def _generate_alignment_verdicts(
         self,
@@ -484,30 +447,19 @@ class SummarizationMetric(BaseMetric):
         if len(self.claims) == 0:
             return []
 
-        verdicts: List[SummarizationAlignmentVerdict] = []
         prompt = SummarizationTemplate.generate_alignment_verdicts(
             summary_claims=self.claims, original_text="\n\n".join(self.truths)
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=SummarizationAlignmentVerdict
-            )
-            self.evaluation_cost += cost
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    SummarizationAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                SummarizationAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )
 
     async def _a_generate_truths(self, text: str) -> List[str]:
         # Borrow faithfulness template
@@ -515,34 +467,24 @@ class SummarizationMetric(BaseMetric):
             retrieval_context=text,
             extraction_limit=self.truths_extraction_limit,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = await self.model.a_generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     async def _a_generate_claims(self, text: str) -> List[str]:
         # Borrow faithfulness template
         prompt = FaithfulnessTemplate.generate_claims(actual_output=text)
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def _generate_truths(self, text: str) -> List[str]:
         # Borrow faithfulness template
@@ -550,34 +492,24 @@ class SummarizationMetric(BaseMetric):
             retrieval_context=text,
             extraction_limit=self.truths_extraction_limit,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = self.model.generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )
 
     def _generate_claims(self, text: str) -> List[str]:
         # Borrow faithfulness template
         prompt = FaithfulnessTemplate.generate_claims(actual_output=text)
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )
 
     def is_successful(self) -> bool:
         if self.error is not None:
@@ -585,7 +517,7 @@ class SummarizationMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
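Every removed block in the hunks above follows the same inline pattern (native-model call with a schema plus cost tracking, otherwise a schema attempt with a trimAndLoadJson fallback), now routed through the generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. The helpers' internals are not shown in this diff; the following is a minimal sketch of the synchronous variant, reconstructed from the code it replaces, with the keyword arguments taken from the call sites above and everything inside the function body assumed.

import json
from typing import Any, Callable, Type, TypeVar

T = TypeVar("T")


def generate_with_schema_and_extract(
    metric: Any,  # calling metric instance; attribute names assumed from the removed code
    prompt: str,
    schema_cls: Type[T],
    extract_schema: Callable[[T], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    """Sketch only: mirrors the inline pattern factored out in this diff."""
    if metric.using_native_model:
        # Native models return (parsed schema instance, cost); accumulate the cost.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback for models without schema support: raw text, then JSON parsing
        # (the old inline code used trimAndLoadJson; json.loads stands in here).
        res = metric.model.generate(prompt)
        return extract_json(json.loads(res))

The async variant presumably mirrors this with await metric.model.a_generate(...), which is consistent with how the call sites above await a_generate_with_schema_and_extract.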
 
deepeval/metrics/summarization/template.py

@@ -1,9 +1,20 @@
+multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
+
 class SummarizationTemplate:
     @staticmethod
     def generate_reason(contradictions, redundancies, questions, score):
         return f"""You will be given the following: 1) information in the summary contradicting the original text, 2) extra information in the summary not mentioned in the original text, 3) [Optional] questions cannot be answered by the summary. Your task is to explain the quality of this summarization task.
 Given the summarization score, which is a 0-1 score indicating how good the summary is to the original text (higher the better), CONCISELY summarize the provided information to justify the score.
 
+{multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
@@ -28,6 +39,9 @@ Extra Information not mentioned in the original text:
     @staticmethod
     def generate_answers(questions, text):
         return f"""Based on the list of close-ended 'yes' or 'no' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided text contains sufficient information to answer EACH question.
+
+{multimodal_rules}
+
 Answers should STRICTLY be either 'yes' or 'no'.
 Answer 'no' if the provided text does not contain enough information to answer the question.
 **
@@ -57,6 +71,8 @@ JSON:
     def generate_questions(text, n):
         return f"""Based on the given text, generate {n} closed-ended questions that can be answered with either a 'yes' or 'no'.
 The questions generated should ALWAYS result in a 'yes' based on the given text.
+
+{multimodal_rules}
 
 ** IMPORTANT
 Only return a JSON with a 'questions' key, which is a list of strings.
@@ -72,6 +88,9 @@ JSON:
     @staticmethod
     def generate_alignment_verdicts(original_text, summary_claims):
         return f"""Based on the given summary claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH piece of info contradicts any facts in the original text. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given summary claim agrees with the original text.
 Provide a 'reason' ONLY if the answer is 'no' OR 'idk'.
 The provided summary claims is drawn from the summary. Try to provide a correction in the reason using the facts in the original text.
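The template hunks above define multimodal_rules at module level and interpolate it into each prompt builder's f-string. A minimal usage sketch of the effect, assuming the module path and static methods stay exactly as shown in this diff:

from deepeval.metrics.summarization.template import SummarizationTemplate

# Build one of the updated prompts; the returned string now embeds the
# "--- MULTIMODAL INPUT RULES ---" block between the task description and
# the JSON-format instructions.
prompt = SummarizationTemplate.generate_questions(
    text="The quarterly report shows revenue grew 12% year over year.",
    n=2,
)
assert "MULTIMODAL INPUT RULES" in prompt
print(prompt)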