deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
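The headline structural change in this release is the rename of the `deepeval/optimization` package to `deepeval/optimizer` (each algorithm's `loop.py` becomes a module named after the algorithm, and a MIPROv2 implementation is added), alongside the removal of the standalone multimodal metrics and `mlllms` model wrappers and the addition of new `turn_*` metric packages. Code importing from the old optimizer package will need updated paths along these lines; this is a sketch based only on the renamed paths in the file list above, and the `PromptOptimizer` class name is an assumption inferred from the file name — check `deepeval/optimizer/__init__.py` for the actual exports:

```python
# deepeval 3.7.4 (package removed in 3.7.6):
# from deepeval.optimization.prompt_optimizer import PromptOptimizer

# deepeval 3.7.6 (assumed equivalent, inferred from the renamed module
# paths in the file list above; verify against the package's exports):
from deepeval.optimizer.prompt_optimizer import PromptOptimizer
```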
deepeval/metrics/summarization/summarization.py

```diff
@@ -5,16 +5,16 @@ from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
-    ConversationalTestCase,
 )
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.summarization.template import SummarizationTemplate
 from deepeval.metrics.faithfulness.template import FaithfulnessTemplate
@@ -77,7 +77,15 @@ class SummarizationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -138,7 +146,15 @@ class SummarizationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -185,7 +201,7 @@ class SummarizationMetric(BaseMetric):

         return self.score

-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -221,24 +237,15 @@ class SummarizationMetric(BaseMetric):
         prompt += """JSON:
 """

-
-
-
-
-
-
-
-            try:
-                res: SummarizationScoreReason = await self.model.a_generate(
-                    prompt, schema=SummarizationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=SummarizationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

-    def _generate_reason(self) -> str:
+    def _generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -274,22 +281,13 @@ class SummarizationMetric(BaseMetric):
         prompt += """JSON:
 """

-
-
-
-
-
-
-
-            try:
-                res: SummarizationScoreReason = self.model.generate(
-                    prompt, schema=SummarizationScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=SummarizationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     def _calculate_score(self, score_type: ScoreType) -> float:
         if score_type == ScoreType.ALIGNMENT:
@@ -327,69 +325,45 @@ class SummarizationMetric(BaseMetric):
         prompt = SummarizationTemplate.generate_answers(
             questions=self.assessment_questions, text=text
         )
-
-
-
-
-
-
-
-                    prompt, schema=Answers
-                )
-                return res.answers
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["answers"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Answers,
+            extract_schema=lambda s: s.answers,
+            extract_json=lambda data: data["answers"],
+        )

     def _generate_answers(self, text: str) -> List[str]:
         prompt = SummarizationTemplate.generate_answers(
             questions=self.assessment_questions, text=text
         )
-
-
-
-
-
-
-
-                return res.answers
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["answers"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Answers,
+            extract_schema=lambda s: s.answers,
+            extract_json=lambda data: data["answers"],
+        )

-    async def _a_generate_assessment_questions(self, text: str):
+    async def _a_generate_assessment_questions(self, text: str) -> List[str]:
         prompt = SummarizationTemplate.generate_questions(text=text, n=self.n)
-
-
-
-
-
-
-
-                    prompt, schema=Questions
-                )
-                return res.questions
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["questions"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Questions,
+            extract_schema=lambda s: s.questions,
+            extract_json=lambda data: data["questions"],
+        )

-    def _generate_assessment_questions(self, text: str):
+    def _generate_assessment_questions(self, text: str) -> List[str]:
         prompt = SummarizationTemplate.generate_questions(text=text, n=self.n)
-
-
-
-
-
-
-
-                return res.questions
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["questions"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Questions,
+            extract_schema=lambda s: s.questions,
+            extract_json=lambda data: data["questions"],
+        )

     async def _a_generate_coverage_verdicts(
         self, test_case: LLMTestCase
@@ -453,30 +427,19 @@ class SummarizationMetric(BaseMetric):
         if len(self.claims) == 0:
             return []

-        verdicts: List[SummarizationAlignmentVerdict] = []
         prompt = SummarizationTemplate.generate_alignment_verdicts(
             summary_claims=self.claims, original_text="\n\n".join(self.truths)
         )
-
-
-
-
-
-
-
-
-
-
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    SummarizationAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                SummarizationAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )

     def _generate_alignment_verdicts(
         self,
@@ -484,30 +447,19 @@ class SummarizationMetric(BaseMetric):
         if len(self.claims) == 0:
             return []

-        verdicts: List[SummarizationAlignmentVerdict] = []
         prompt = SummarizationTemplate.generate_alignment_verdicts(
             summary_claims=self.claims, original_text="\n\n".join(self.truths)
         )
-
-
-
-
-
-
-
-
-
-
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    SummarizationAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                SummarizationAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )

     async def _a_generate_truths(self, text: str) -> List[str]:
         # Borrow faithfulness template
@@ -515,34 +467,24 @@ class SummarizationMetric(BaseMetric):
             retrieval_context=text,
             extraction_limit=self.truths_extraction_limit,
         )
-
-
-
-
-
-
-
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )

     async def _a_generate_claims(self, text: str) -> List[str]:
         # Borrow faithfulness template
         prompt = FaithfulnessTemplate.generate_claims(actual_output=text)
-
-
-
-
-
-
-
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )

     def _generate_truths(self, text: str) -> List[str]:
         # Borrow faithfulness template
@@ -550,34 +492,24 @@ class SummarizationMetric(BaseMetric):
             retrieval_context=text,
             extraction_limit=self.truths_extraction_limit,
         )
-
-
-
-
-
-
-
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
+        )

     def _generate_claims(self, text: str) -> List[str]:
         # Borrow faithfulness template
         prompt = FaithfulnessTemplate.generate_claims(actual_output=text)
-
-
-
-
-
-
-
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
+        )

     def is_successful(self) -> bool:
         if self.error is not None:
@@ -585,7 +517,7 @@ class SummarizationMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
```
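Every removed `try: ... except TypeError:` block above is collapsed into a call to the shared `generate_with_schema_and_extract` / `a_generate_with_schema_and_extract` helpers now imported from `deepeval.metrics.utils`. A minimal sketch of the behavior being centralized, reconstructed from the deleted code (the real helpers live in `deepeval/metrics/utils.py` and also cover the native-model cost accounting this excerpt only hints at):

```python
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    """Sketch: try schema-constrained generation, fall back to raw JSON."""
    try:
        # Preferred path: ask the model for output constrained to the schema.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback: the model wrapper does not accept a `schema` kwarg,
        # so parse the raw completion as JSON instead.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```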
deepeval/metrics/summarization/template.py

```diff
@@ -1,9 +1,20 @@
+multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
+
 class SummarizationTemplate:
     @staticmethod
     def generate_reason(contradictions, redundancies, questions, score):
         return f"""You will be given the following: 1) information in the summary contradicting the original text, 2) extra information in the summary not mentioned in the original text, 3) [Optional] questions cannot be answered by the summary. Your task is to explain the quality of this summarization task.
 Given the summarization score, which is a 0-1 score indicating how good the summary is to the original text (higher the better), CONCISELY summarize the provided information to justify the score.

+{multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
@@ -28,6 +39,9 @@ Extra Information not mentioned in the original text:
     @staticmethod
     def generate_answers(questions, text):
         return f"""Based on the list of close-ended 'yes' or 'no' questions, generate a JSON with key 'answers', which is a list of strings that determines whether the provided text contains sufficient information to answer EACH question.
+
+{multimodal_rules}
+
 Answers should STRICTLY be either 'yes' or 'no'.
 Answer 'no' if the provided text does not contain enough information to answer the question.
 **
@@ -57,6 +71,8 @@ JSON:
     def generate_questions(text, n):
         return f"""Based on the given text, generate {n} closed-ended questions that can be answered with either a 'yes' or 'no'.
 The questions generated should ALWAYS result in a 'yes' based on the given text.
+
+{multimodal_rules}

 ** IMPORTANT
 Only return a JSON with a 'questions' key, which is a list of strings.
@@ -72,6 +88,9 @@ JSON:
     @staticmethod
     def generate_alignment_verdicts(original_text, summary_claims):
         return f"""Based on the given summary claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH piece of info contradicts any facts in the original text. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given summary claim agrees with the original text.
 Provide a 'reason' ONLY if the answer is 'no' OR 'idk'.
 The provided summary claims is drawn from the summary. Try to provide a correction in the reason using the facts in the original text.
```