deepeval 3.7.4__py3-none-any.whl → 3.7.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/dataset/golden.py +54 -2
- deepeval/evaluate/evaluate.py +16 -8
- deepeval/evaluate/execute.py +70 -26
- deepeval/evaluate/utils.py +26 -22
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/metrics/__init__.py +14 -12
- deepeval/metrics/answer_relevancy/answer_relevancy.py +74 -29
- deepeval/metrics/answer_relevancy/template.py +188 -92
- deepeval/metrics/base_metric.py +2 -5
- deepeval/metrics/contextual_precision/contextual_precision.py +53 -15
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +50 -13
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +47 -15
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +70 -27
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/utils.py +2 -2
- deepeval/metrics/indicator.py +4 -4
- deepeval/metrics/multimodal_metrics/__init__.py +0 -18
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +24 -17
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +26 -21
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +24 -17
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +24 -17
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +19 -19
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +63 -78
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +20 -20
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +71 -50
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/tool_correctness/tool_correctness.py +2 -2
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +550 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +520 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +535 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +596 -0
- deepeval/metrics/utils.py +39 -58
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +16 -38
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +52 -28
- deepeval/models/embedding_models/local_embedding_model.py +18 -14
- deepeval/models/embedding_models/ollama_embedding_model.py +38 -16
- deepeval/models/embedding_models/openai_embedding_model.py +40 -21
- deepeval/models/llms/amazon_bedrock_model.py +1 -2
- deepeval/models/llms/anthropic_model.py +44 -23
- deepeval/models/llms/azure_model.py +121 -36
- deepeval/models/llms/deepseek_model.py +18 -13
- deepeval/models/llms/gemini_model.py +129 -43
- deepeval/models/llms/grok_model.py +18 -13
- deepeval/models/llms/kimi_model.py +18 -13
- deepeval/models/llms/litellm_model.py +42 -22
- deepeval/models/llms/local_model.py +12 -7
- deepeval/models/llms/ollama_model.py +114 -12
- deepeval/models/llms/openai_model.py +137 -41
- deepeval/models/llms/portkey_model.py +24 -7
- deepeval/models/llms/utils.py +5 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +46 -1
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +12 -10
- deepeval/test_case/conversational_test_case.py +19 -1
- deepeval/test_case/llm_test_case.py +152 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +15 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/RECORD +116 -125
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.5.dist-info}/entry_points.txt +0 -0
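Read together, the renames and deletions above amount to two reorganizations: the standalone multimodal RAG metrics (and MLLMTestCase) are removed in favour of multimodal-aware LLMTestCase support plus new turn-level conversational metrics, and the deepeval/optimization package is rebuilt as deepeval/optimizer. A rough before/after import sketch, assuming the public names follow the file paths shown above (only the metric and test-case names are confirmed by the __init__.py diff further down):

# 3.7.4-style imports that no longer exist in 3.7.5 (per the deleted files above):
# from deepeval.metrics import MultimodalFaithfulnessMetric, MultimodalAnswerRelevancyMetric
# from deepeval.test_case import MLLMTestCase
# from deepeval.optimization import ...

# 3.7.5 equivalents:
from deepeval.metrics import TurnFaithfulnessMetric, TurnContextualRecallMetric
from deepeval.test_case import LLMTestCase, MLLMImage  # multimodal cases now ride on LLMTestCase
# from deepeval.optimizer import ...  # optimizer package replaces deepeval.optimization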
deepeval/metrics/__init__.py
CHANGED
@@ -42,6 +42,16 @@ from .mcp_use_metric.mcp_use_metric import MCPUseMetric
 from .turn_relevancy.turn_relevancy import (
     TurnRelevancyMetric,
 )
+from .turn_faithfulness.turn_faithfulness import TurnFaithfulnessMetric
+from .turn_contextual_precision.turn_contextual_precision import (
+    TurnContextualPrecisionMetric,
+)
+from .turn_contextual_recall.turn_contextual_recall import (
+    TurnContextualRecallMetric,
+)
+from .turn_contextual_relevancy.turn_contextual_relevancy import (
+    TurnContextualRelevancyMetric,
+)
 from .conversation_completeness.conversation_completeness import (
     ConversationCompletenessMetric,
 )
@@ -55,12 +65,6 @@ from .multimodal_metrics import (
     ImageCoherenceMetric,
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
-    MultimodalContextualRecallMetric,
-    MultimodalContextualRelevancyMetric,
-    MultimodalContextualPrecisionMetric,
-    MultimodalAnswerRelevancyMetric,
-    MultimodalFaithfulnessMetric,
-    MultimodalToolCorrectnessMetric,
     MultimodalGEval,
 )

@@ -119,17 +123,15 @@ __all__ = [
     # Conversational metrics
     "TurnRelevancyMetric",
     "ConversationCompletenessMetric",
+    "TurnFaithfulnessMetric",
+    "TurnContextualPrecisionMetric",
+    "TurnContextualRecallMetric",
+    "TurnContextualRelevancyMetric",
     # Multimodal metrics
     "TextToImageMetric",
     "ImageEditingMetric",
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
-    "MultimodalContextualRecallMetric",
-    "MultimodalContextualRelevancyMetric",
-    "MultimodalContextualPrecisionMetric",
-    "MultimodalAnswerRelevancyMetric",
-    "MultimodalFaithfulnessMetric",
-    "MultimodalToolCorrectnessMetric",
     "MultimodalGEval",
 ]
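The four new turn-level metrics are re-exported from deepeval.metrics alongside the existing conversational metrics. A minimal usage sketch, assuming they follow the same constructor and measure() interface as TurnRelevancyMetric; the Turn/ConversationalTestCase shapes and the threshold/retrieval_context arguments below are assumptions, not part of this diff:

from deepeval.metrics import TurnContextualRecallMetric, TurnFaithfulnessMetric
from deepeval.test_case import ConversationalTestCase, Turn  # Turn import assumed

convo = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(
            role="assistant",
            content="It shipped yesterday and arrives Friday.",
            retrieval_context=["Order #1042 shipped Tuesday, ETA Friday."],  # field assumed
        ),
    ]
)

metric = TurnFaithfulnessMetric(threshold=0.7)  # kwargs assumed from BaseMetric conventions
metric.measure(convo)
print(metric.score, metric.reason)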
deepeval/metrics/answer_relevancy/answer_relevancy.py
CHANGED

@@ -1,16 +1,17 @@
 from typing import Optional, List, Type, Union

-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     trimAndLoadJson,
     check_llm_test_case_params,
+    check_mllm_test_case_params,
     initialize_model,
 )
-from deepeval.test_case import
-    LLMTestCase,
-    LLMTestCaseParams,
-)
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
 from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
@@ -53,7 +54,14 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
+
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -70,14 +78,17 @@ class AnswerRelevancyMetric(BaseMetric):
                 )
             )
         else:
+            input = test_case.input
+            actual_output = test_case.actual_output
+
             self.statements: List[str] = self._generate_statements(
-
+                actual_output, multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                self._generate_verdicts(
+                self._generate_verdicts(input, multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = self._generate_reason(
+            self.reason = self._generate_reason(input, multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -101,7 +112,14 @@ class AnswerRelevancyMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
+
+        multimodal = test_case.multimodal
+        if multimodal:
+            check_mllm_test_case_params(
+                test_case, self._required_params, None, None, self, self.model
+            )
+        else:
+            check_llm_test_case_params(test_case, self._required_params, self)

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -110,14 +128,17 @@ class AnswerRelevancyMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
+            input = test_case.input
+            actual_output = test_case.actual_output
+
             self.statements: List[str] = await self._a_generate_statements(
-
+                actual_output, multimodal
             )
             self.verdicts: List[AnswerRelevancyVerdict] = (
-                await self._a_generate_verdicts(
+                await self._a_generate_verdicts(input, multimodal)
             )
             self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason(
+            self.reason = await self._a_generate_reason(input, multimodal)
             self.success = self.score >= self.threshold
             self.verbose_logs = construct_verbose_logs(
                 self,
@@ -133,7 +154,7 @@ class AnswerRelevancyMetric(BaseMetric):
         )
         return self.score

-    async def _a_generate_reason(self, input: str) -> str:
+    async def _a_generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None

@@ -146,7 +167,9 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )
+
         if self.using_native_model:
             res, cost = await self.model.a_generate(
                 prompt, schema=AnswerRelevancyScoreReason
@@ -164,7 +187,7 @@ class AnswerRelevancyMetric(BaseMetric):
             data = trimAndLoadJson(res, self)
             return data["reason"]

-    def _generate_reason(self, input: str) -> str:
+    def _generate_reason(self, input: str, multimodal: bool) -> str:
         if self.include_reason is False:
             return None

@@ -177,6 +200,7 @@ class AnswerRelevancyMetric(BaseMetric):
             irrelevant_statements=irrelevant_statements,
             input=input,
             score=format(self.score, ".2f"),
+            multimodal=multimodal,
         )

         if self.using_native_model:
@@ -197,14 +221,13 @@ class AnswerRelevancyMetric(BaseMetric):
         return data["reason"]

     async def _a_generate_verdicts(
-        self, input: str
+        self, input: str, multimodal: bool
     ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []

         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )

         if self.using_native_model:
@@ -224,14 +247,16 @@ class AnswerRelevancyMetric(BaseMetric):
             AnswerRelevancyVerdict(**item) for item in data["verdicts"]
         ]

-    def _generate_verdicts(
+    def _generate_verdicts(
+        self, input: str, multimodal: bool
+    ) -> List[AnswerRelevancyVerdict]:
         if len(self.statements) == 0:
             return []

         prompt = self.evaluation_template.generate_verdicts(
-            input=input,
-            statements=self.statements,
+            input=input, statements=self.statements, multimodal=multimodal
         )
+
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Verdicts)
             self.evaluation_cost += cost
@@ -250,44 +275,64 @@ class AnswerRelevancyMetric(BaseMetric):
     async def _a_generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
         )
         if self.using_native_model:
             res, cost = await self.model.a_generate(prompt, schema=Statements)
             self.evaluation_cost += cost
-
+            statements: List[str] = res.statements + [
+                ele for ele in actual_output if isinstance(ele, MLLMImage)
+            ]
+            return statements
         else:
             try:
                 res: Statements = await self.model.a_generate(
                     prompt, schema=Statements
                 )
-
+                statements: List[str] = res.statements + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
             except TypeError:
                 res = await self.model.a_generate(prompt)
                 data = trimAndLoadJson(res, self)
-
+                statements = data["statements"] + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements

     def _generate_statements(
         self,
         actual_output: str,
+        multimodal: bool,
     ) -> List[str]:
         prompt = self.evaluation_template.generate_statements(
-            actual_output=actual_output,
+            actual_output=actual_output, multimodal=multimodal
         )
         if self.using_native_model:
             res, cost = self.model.generate(prompt, schema=Statements)
             self.evaluation_cost += cost
-
+            statements = res.statements + [
+                ele for ele in actual_output if isinstance(ele, MLLMImage)
+            ]
+            return statements
         else:
             try:
                 res: Statements = self.model.generate(prompt, schema=Statements)
-
+                statements = res.statements + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements
             except TypeError:
                 res = self.model.generate(prompt)
                 data = trimAndLoadJson(res, self)
-
+                statements = data["statements"] + [
+                    ele for ele in actual_output if isinstance(ele, MLLMImage)
+                ]
+                return statements

     def _calculate_score(self):
         number_of_verdicts = len(self.verdicts)
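The net effect of this file's changes: AnswerRelevancyMetric now branches on test_case.multimodal, validates with check_mllm_test_case_params when images are present, and appends any MLLMImage elements found in actual_output to the extracted statements. A minimal sketch of the multimodal path, assuming LLMTestCase accepts mixed text/image content as implied by the +152-line change to llm_test_case.py (exact constructor details are not shown in this diff):

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase, MLLMImage

test_case = LLMTestCase(
    input="What does the attached receipt say about refunds?",
    actual_output=[
        "Shoes can be refunded at no extra cost.",
        MLLMImage(url="https://example.com/receipt.png"),  # hypothetical image
    ],
)

metric = AnswerRelevancyMetric()  # threshold / model kwargs omitted
metric.measure(test_case)        # routes through the multimodal branch above
print(metric.score, metric.reason)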
deepeval/metrics/answer_relevancy/template.py
CHANGED

@@ -1,110 +1,206 @@
 from typing import List
+import textwrap


 class AnswerRelevancyTemplate:
     @staticmethod
-    def generate_statements(actual_output: str):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def generate_statements(actual_output: str, multimodal: bool = False):
+        multimodal_instruction = ""
+        example_text = ""
+        example_json = ""
+
+        if multimodal:
+            multimodal_instruction = " The text may contain images as well."
+            example_text = "Shoes. The shoes can be refunded at no extra cost. Thanks for asking the question!"
+            example_json = textwrap.dedent(
+                """
+                {{
+                    "statements": ["Shoes.", "Shoes can be refunded at no extra cost", "Thanks for asking the question!"]
+                }}
+                """
+            )
+        else:
+            example_text = "Our new laptop model features a high-resolution Retina display for crystal-clear visuals. It also includes a fast-charging battery, giving you up to 12 hours of usage on a single charge. For security, we've added fingerprint authentication and an encrypted SSD. Plus, every purchase comes with a one-year warranty and 24/7 customer support."
+            example_json = textwrap.dedent(
+                """
+                {{
+                    "statements": [
+                        "The new laptop model has a high-resolution Retina display.",
+                        "It includes a fast-charging battery with up to 12 hours of usage.",
+                        "Security features include fingerprint authentication and an encrypted SSD.",
+                        "Every purchase comes with a one-year warranty.",
+                        "24/7 customer support is included."
+                    ]
+                }}
+                """
+            )
+
+        coherence_note = (
+            ""
+            if multimodal
+            else " Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement."
+        )
+
+        return textwrap.dedent(
+            f"""Given the text, breakdown and generate a list of statements presented.{coherence_note}{multimodal_instruction}
+
+            Example:
+            Example text:
+            {example_text}
+
+            {example_json}
+            ===== END OF EXAMPLE ======
+
+            **
+            IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+            **
+
+            Text:
+            {actual_output}
+
+            JSON:
+            """
+        )

     @staticmethod
-    def generate_verdicts(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        }}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        """
+    def generate_verdicts(
+        input: str, statements: str, multimodal: bool = False
+    ):
+        content_type = (
+            "statements (which can contain images)"
+            if multimodal
+            else "list of statements"
+        )
+        statement_or_image = "statement or image" if multimodal else "statement"
+
+        format_instruction = textwrap.dedent(
+            """
+            Expected JSON format:
+            {{
+                "verdicts": [
+                    {{
+                        "verdict": "yes"
+                    }},
+                    {{
+                        "reason": <explanation_for_irrelevance>,
+                        "verdict": "no"
+                    }},
+                    {{
+                        "reason": <explanation_for_ambiguity>,
+                        "verdict": "idk"
+                    }}
+                ]
+            }}
+            """
+        )
+
+        example_section = ""
+        if multimodal:
+            example_section = textwrap.dedent(
+                """
+                Example input: What should I do if there is an earthquake?
+                Example statements: ["Shoes.", "Thanks for asking the question!", "Is there anything else I can help you with?", "Duck and hide"]
+                Example JSON:
+                {{
+                    "verdicts": [
+                        {{
+                            "reason": "The 'Shoes.' statement made in the actual output is completely irrelevant to the input, which asks about what to do in the event of an earthquake.",
+                            "verdict": "no"
+                        }},
+                        {{
+                            "reason": "The statement thanking the user for asking the question is not directly relevant to the input, but is not entirely irrelevant.",
+                            "verdict": "idk"
+                        }},
+                        {{
+                            "reason": "The question about whether there is anything else the user can help with is not directly relevant to the input, but is not entirely irrelevant.",
+                            "verdict": "idk"
+                        }},
+                        {{
+                            "verdict": "yes"
+                        }}
+                    ]
+                }}
+                """
+            )
+
+        guidelines = ""
+        if multimodal:
+            guidelines = textwrap.dedent(
+                f"""
+                Since you are going to generate a verdict for each statement and image, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
+                """
+            )
+        else:
+            guidelines = textwrap.dedent(
+                f"""
+                Generate ONE verdict per statement - number of 'verdicts' MUST equal number of statements.
+                'verdict' must be STRICTLY 'yes', 'no', or 'idk':
+                - 'yes': statement is relevant to addressing the input
+                - 'no': statement is irrelevant to the input
+                - 'idk': statement is ambiguous (not directly relevant but could be supporting information)
+                Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+                """
+            )
+
+        return textwrap.dedent(
+            f"""For the provided {content_type}, determine whether each {statement_or_image} is relevant to address the input.
+            {"Please generate a list of JSON with two keys: `verdict` and `reason`." if multimodal else "Generate JSON objects with 'verdict' and 'reason' fields."}
+            The 'verdict' {"key " if multimodal else ''}should {"STRICTLY be either a 'yes', 'idk' or 'no'" if multimodal else "be 'yes' (relevant), 'no' (irrelevant), or 'idk' (ambiguous/supporting information)"}. {"Answer 'yes' if the " + statement_or_image + ' is relevant to addressing the original input, no if the ' + statement_or_image + ' is irrelevant, and "idk" if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).' if multimodal else ""}
+            {"The 'reason' is the reason for the verdict." if multimodal else ""}
+            Provide 'reason' ONLY for 'no' or 'idk' verdicts.
+            The {"provided statements are statements and images" if multimodal else "statements are from an AI's actual output"} generated in the actual output.
+
+            **
+            IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
+
+            {format_instruction if not multimodal else ''}
+            {example_section}
+            {guidelines}
+            **
+
+            Input:
+            {input}
+
+            Statements:
+            {statements}
+
+            JSON:
+            """
+        )

     @staticmethod
     def generate_reason(
-        irrelevant_statements: List[str],
+        irrelevant_statements: List[str],
+        input: str,
+        score: float,
+        multimodal: bool = False,
     ):
-        return
-
-
-
+        return textwrap.dedent(
+            f"""Given the answer relevancy score, the list of reasons of irrelevant statements made in the actual output, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score.
+            The irrelevant statements represent things in the actual output that is irrelevant to addressing whatever is asked/talked about in the input.
+            If there is nothing irrelevant, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).

-        **
-        IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.

-
-
-        {{
-        "reason": "The score is <answer_relevancy_score> because <your_reason>."
-        }}
-        ===== END OF EXAMPLE ======
-        **
+            **
+            IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.

+            {"Example:" if not multimodal else ""}
+            Example JSON:
+            {{
+            "reason": "The score is <answer_relevancy_score> because <your_reason>."
+            }}
+            {"===== END OF EXAMPLE ======" if not multimodal else ""}
+            **

-        Answer Relevancy Score:
-        {score}
+            Answer Relevancy Score:
+            {score}

-        Reasons why the score can't be higher based on irrelevant statements in the actual output:
-        {irrelevant_statements}
+            Reasons why the score can't be higher based on irrelevant statements in the actual output:
+            {irrelevant_statements}

-        Input:
-        {input}
+            Input:
+            {input}

-        JSON:
-        """
+            JSON:
+            """
+        )
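Because the template methods remain plain static string builders (now wrapped in textwrap.dedent and parameterized by multimodal), the generated prompts can be inspected directly; a small sketch using the signature shown in the diff above:

from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate

# multimodal=False uses the text-only laptop example; multimodal=True switches
# to the image-aware instructions and example shown above.
prompt = AnswerRelevancyTemplate.generate_statements(
    actual_output="Shoes can be refunded at no extra cost.",
    multimodal=False,
)
print(prompt)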
deepeval/metrics/base_metric.py
CHANGED
@@ -4,7 +4,6 @@ from typing import Optional, Dict, List
 from deepeval.test_case import (
     LLMTestCase,
     ConversationalTestCase,
-    MLLMTestCase,
     LLMTestCaseParams,
     ArenaTestCase,
 )
@@ -113,13 +112,11 @@ class BaseMultimodalMetric:
         self._threshold = value

     @abstractmethod
-    def measure(self, test_case:
+    def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
         raise NotImplementedError

     @abstractmethod
-    async def a_measure(
-        self, test_case: MLLMTestCase, *args, **kwargs
-    ) -> float:
+    async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
         raise NotImplementedError(
             f"Async execution for {self.__class__.__name__} not supported yet. Please set 'async_mode' to 'False'."
         )