deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
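The list above removes the standalone multimodal metric implementations (multimodal_metrics/*) and the models/mlllms/* wrappers, while the surviving single-turn metrics each gain a multimodal code path; the faithfulness diff below shows the mechanism: the metric reads a multimodal flag off the test case instead of requiring a separate multimodal metric class. The following is a minimal consumer-side sketch using the existing public FaithfulnessMetric and LLMTestCase API; the exact 3.7.6 constructor arguments for image inputs are not shown in this diff, so only a text-only case is sketched.

# Sketch only: exercising FaithfulnessMetric after the 3.7.6 reorganization.
# FaithfulnessMetric, LLMTestCase, measure(), score, and reason are existing
# deepeval public API; the multimodal handling in the diff below is driven by
# the test case itself, so a plain text test case needs no changes.
from deepeval.test_case import LLMTestCase
from deepeval.metrics import FaithfulnessMetric

test_case = LLMTestCase(
    input="Who authored the 2023 report?",
    actual_output="The 2023 report was authored by the data team.",
    retrieval_context=["The 2023 annual report was written by the data team."],
)

metric = FaithfulnessMetric(threshold=0.7)
metric.measure(test_case)  # internally reads test_case.multimodal (see diff below)
print(metric.score, metric.reason)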
deepeval/metrics/faithfulness/faithfulness.py

@@ -1,17 +1,18 @@
 from typing import List, Optional, Union, Type
 import asyncio

-from deepeval.test_case import (
-    LLMTestCase,
-    LLMTestCaseParams,
-)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.metrics import BaseMetric
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.faithfulness.template import FaithfulnessTemplate
@@ -67,7 +68,16 @@ class FaithfulnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -84,11 +94,16 @@ class FaithfulnessMetric(BaseMetric):
                     )
                 )
             else:
-
-
-
+                retrieval_context = test_case.retrieval_context
+                actual_output = test_case.actual_output
+
+                self.truths = self._generate_truths(
+                    retrieval_context, multimodal
+                )
+                self.claims = self._generate_claims(actual_output, multimodal)
+                self.verdicts = self._generate_verdicts(multimodal)
                 self.score = self._calculate_score()
-                self.reason = self._generate_reason()
+                self.reason = self._generate_reason(multimodal)
                 self.success = self.score >= self.threshold
                 self.verbose_logs = construct_verbose_logs(
                     self,
@@ -114,7 +129,16 @@ class FaithfulnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -123,13 +147,16 @@ class FaithfulnessMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            retrieval_context = test_case.retrieval_context
+            actual_output = test_case.actual_output
+
             self.truths, self.claims = await asyncio.gather(
-                self._a_generate_truths(test_case.retrieval_context),
-                self._a_generate_claims(test_case.actual_output),
+                self._a_generate_truths(retrieval_context, multimodal),
+                self._a_generate_claims(actual_output, multimodal),
             )
-            self.verdicts = await self._a_generate_verdicts()
+            self.verdicts = await self._a_generate_verdicts(multimodal)
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason()
+            self.reason = await self._a_generate_reason(multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -146,7 +173,7 @@ class FaithfulnessMetric(BaseMetric):
             )
             return self.score

-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
             return None

@@ -158,26 +185,18 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_reason(
             contradictions=contradictions,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )

-
-
-
-
-
-
-
-            try:
-                res: FaithfulnessScoreReason = await self.model.a_generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

-    def _generate_reason(self) -> str:
+    def _generate_reason(self, multimodal: bool) -> str:
         if self.include_reason is False:
             return None

@@ -189,148 +208,118 @@ class FaithfulnessMetric(BaseMetric):
         prompt = self.evaluation_template.generate_reason(
             contradictions=contradictions,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )

-
-
-
-
-
-
-
-            try:
-                res: FaithfulnessScoreReason = self.model.generate(
-                    prompt, schema=FaithfulnessScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=FaithfulnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

-    async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]:
+    async def _a_generate_verdicts(
+        self, multimodal: bool
+    ) -> List[FaithfulnessVerdict]:
         if len(self.claims) == 0:
             return []

-        verdicts: List[FaithfulnessVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
-            claims=self.claims,
+            claims=self.claims,
+            retrieval_context="\n\n".join(self.truths),
+            multimodal=multimodal,
         )
-
-
-            self
-
-
-
-
-
-
-
-
-
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _generate_verdicts(self) -> List[FaithfulnessVerdict]:
+
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                FaithfulnessVerdict(**item) for item in data["verdicts"]
+            ],
+        )
+
+    def _generate_verdicts(self, multimodal: bool) -> List[FaithfulnessVerdict]:
         if len(self.claims) == 0:
             return []

-        verdicts: List[FaithfulnessVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
-            claims=self.claims,
+            claims=self.claims,
+            retrieval_context="\n\n".join(self.truths),
+            multimodal=multimodal,
         )
-
-
-            self
-
-
-
-
-
-
-
-
-
-
-
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    async def _a_generate_truths(self, retrieval_context: str) -> List[str]:
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                FaithfulnessVerdict(**item) for item in data["verdicts"]
+            ],
+        )
+
+    async def _a_generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_truths(
             retrieval_context="\n\n".join(retrieval_context),
             extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = await self.model.a_generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]

-    def _generate_truths(
+    def _generate_truths(
+        self, retrieval_context: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_truths(
             retrieval_context="\n\n".join(retrieval_context),
             extraction_limit=self.truths_extraction_limit,
+            multimodal=multimodal,
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Truths,
+            extract_schema=lambda s: s.truths,
+            extract_json=lambda data: data["truths"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Truths)
-            self.evaluation_cost += cost
-            return res.truths
-        else:
-            try:
-                res: Truths = self.model.generate(prompt, schema=Truths)
-                return res.truths
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["truths"]

-    async def _a_generate_claims(
+    async def _a_generate_claims(
+        self, actual_output: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_claims(
-            actual_output=actual_output
+            actual_output=actual_output, multimodal=multimodal
+        )
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]

-    def _generate_claims(
+    def _generate_claims(
+        self, actual_output: str, multimodal: bool
+    ) -> List[str]:
         prompt = self.evaluation_template.generate_claims(
-            actual_output=actual_output
+            actual_output=actual_output, multimodal=multimodal
+        )
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Claims,
+            extract_schema=lambda s: s.claims,
+            extract_json=lambda data: data["claims"],
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Claims)
-            self.evaluation_cost += cost
-            return res.claims
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["claims"]

     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -357,7 +346,7 @@ class FaithfulnessMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success

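Every removed block above follows the same inline pattern: a native-model branch that accumulates evaluation_cost, a schema-typed call for custom models, and a TypeError fallback that parses raw output with trimAndLoadJson. Version 3.7.6 routes all of these through generate_with_schema_and_extract / a_generate_with_schema_and_extract in deepeval.metrics.utils. The helper names and keyword arguments below are taken from the added lines; the body is a hypothetical reconstruction of that pattern, not the actual implementation.

# Hypothetical reconstruction of the consolidated helper, inferred from the
# removed try/except blocks above; the real generate_with_schema_and_extract
# in deepeval.metrics.utils may differ in signature and behavior.
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # existing JSON fallback parser


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[BaseModel], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    if metric.using_native_model:
        # Native models return (result, cost); accumulate cost on the metric.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models with structured-output support return the schema instance.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: generate plain text and parse it as JSON.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

The async counterpart, a_generate_with_schema_and_extract, would follow the same shape with awaited metric.model.a_generate(...) calls, consistent with the awaited helper calls added in the diff above.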