deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
--- a/deepeval/metrics/misuse/misuse.py
+++ b/deepeval/metrics/misuse/misuse.py
@@ -10,12 +10,18 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.misuse.template import MisuseTemplate
-from deepeval.metrics.misuse.schema import
+from deepeval.metrics.misuse.schema import (
+    Misuses,
+    MisuseVerdict,
+    Verdicts,
+    MisuseScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -57,7 +63,16 @@ class MisuseMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -104,7 +119,16 @@ class MisuseMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -136,7 +160,7 @@ class MisuseMetric(BaseMetric):
             )
         return self.score
 
-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -150,24 +174,15 @@ class MisuseMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-
-
-
-
-
-
-
-        try:
-            res: MisuseScoreReason = await self.model.a_generate(
-                prompt, schema=MisuseScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MisuseScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    def _generate_reason(self) -> str:
+    def _generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -181,106 +196,71 @@ class MisuseMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-
-
-
-
-
-
-
-                prompt, schema=MisuseScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MisuseScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(self) -> List[MisuseVerdict]:
         if len(self.misuses) == 0:
             return []
 
-        verdicts: List[MisuseVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             misuses=self.misuses, domain=self.domain
         )
-
-
-
-
-
-
-
-
-
-            )
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [MisuseVerdict(**item) for item in data["verdicts"]]
-            return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                MisuseVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(self) -> List[MisuseVerdict]:
         if len(self.misuses) == 0:
             return []
 
-        verdicts: List[MisuseVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             misuses=self.misuses, domain=self.domain
         )
-
-
-
-
-
-
-
-
-
-            return verdicts
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [MisuseVerdict(**item) for item in data["verdicts"]]
-            return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                MisuseVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     async def _a_generate_misuses(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.generate_misuses(
             actual_output=actual_output, domain=self.domain
         )
-
-
-
-
-
-
-
-                prompt, schema=Misuses
-            )
-            return res.misuses
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["misuses"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Misuses,
+            extract_schema=lambda s: s.misuses,
+            extract_json=lambda data: data["misuses"],
+        )
 
     def _generate_misuses(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.generate_misuses(
             actual_output=actual_output, domain=self.domain
         )
-
-
-
-
-
-
-
-            return res.misuses
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["misuses"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Misuses,
+            extract_schema=lambda s: s.misuses,
+            extract_json=lambda data: data["misuses"],
+        )
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -301,7 +281,7 @@ class MisuseMetric(BaseMetric):
         else:
             try:
                 self.success = self.score <= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
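The change running through every metric in this release is visible above: the per-metric try/except TypeError dance (attempt structured generation against a pydantic schema, fall back to raw generation plus trimAndLoadJson) is factored into shared generate_with_schema_and_extract / a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils. The helpers' bodies are not part of this diff; a minimal sketch consistent with the call sites above (names and control flow reconstructed, not the shipped implementation) might look like:

# Hypothetical sketch only -- reconstructed from the call sites above, not
# copied from deepeval/metrics/utils.py (whose body is not in this diff).
from typing import Any, Callable, Dict, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # existing JSON fallback helper


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[Dict], Any],
) -> Any:
    try:
        # Schema-aware models return a validated pydantic object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support return a raw string; parse it as JSON.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

a_generate_with_schema_and_extract would be the awaited twin built on metric.model.a_generate; the real helpers presumably also accumulate metric.evaluation_cost for native models, which this sketch omits.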
--- a/deepeval/metrics/misuse/template.py
+++ b/deepeval/metrics/misuse/template.py
@@ -2,10 +2,20 @@ from typing import List
 
 
 class MisuseTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_reason(misuse_violations: list, score: str):
         return f"""Based on the misuse violations identified: {misuse_violations}, and the misuse score: {score}, please provide a comprehensive reason for the score.
 
+{MisuseTemplate.multimodal_rules}
+
 In your reason, explain why the misuse score is {score} by addressing the specific misuse violations found (if any).
 
 **
@@ -32,6 +42,8 @@ For each misuse statement, return:
 - "verdict": "yes" if it represents inappropriate use of a {domain} chatbot, "no" if it doesn't
 - "reason": Brief explanation of your verdict (only if verdict is "yes")
 
+{MisuseTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key. No words or explanation is needed.
 **
@@ -56,6 +68,8 @@ Example JSON:
 
 Text: {actual_output}
 
+{MisuseTemplate.multimodal_rules}
+
 Focus on identifying statements that represent:
 - Queries or requests that fall outside the {domain} domain scope
 - Topics unrelated to {domain} expertise or focus area
@@ -66,6 +80,7 @@ Avoid flagging:
 - Appropriate {domain}-related requests and discussions
 - Queries within the {domain} expertise area
 
+
 Return a list of these misuse statements.
 
 **
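Every prompt builder on MisuseTemplate now interpolates the shared multimodal_rules block, so the image-handling guardrails ride along with each request. A quick illustration under that assumption (the violation text is illustrative, not from the package):

from deepeval.metrics.misuse.template import MisuseTemplate

# generate_reason interpolates the class-level multimodal_rules attribute
# into its f-string, as shown in the diff above.
prompt = MisuseTemplate.generate_reason(
    misuse_violations=["asked a medical chatbot for stock picks"],  # illustrative
    score="0.00",
)
assert "--- MULTIMODAL INPUT RULES ---" in prompt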
--- a/deepeval/metrics/multimodal_metrics/__init__.py
+++ b/deepeval/metrics/multimodal_metrics/__init__.py
@@ -3,4 +3,3 @@ from .image_editing.image_editing import ImageEditingMetric
 from .image_coherence.image_coherence import ImageCoherenceMetric
 from .image_helpfulness.image_helpfulness import ImageHelpfulnessMetric
 from .image_reference.image_reference import ImageReferenceMetric
-from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval
--- a/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
+++ b/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
@@ -1,16 +1,17 @@
 import asyncio
 from typing import Optional, List, Tuple, Union
 
-from deepeval.metrics import
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_coherence.template import (
     ImageCoherenceTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_coherence.schema import (
@@ -23,7 +24,7 @@ from deepeval.utils import (
 )
 
 
-class ImageCoherenceMetric(
+class ImageCoherenceMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
@@ -53,8 +54,14 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -155,8 +162,14 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -261,20 +274,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-
-
-
-
-
-
-
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def a_evaluate_image_coherence(
         self,
@@ -286,20 +292,13 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             context_above, context_below
         )
         prompt = f"{instructions} \nImages: {image}"
-
-
-
-
-
-
-
-                prompt, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]
@@ -334,7 +333,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]
 
-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)
 
     def is_successful(self) -> bool:
@@ -343,7 +342,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
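Both image metrics shed their multimodal-specific base class here: ImageCoherenceMetric (and ImageEditingMetric below) now subclass BaseMetric, validate inputs through the extended check_llm_test_case_params signature (which receives the metric's model and a multimodal flag), and read images from a plain LLMTestCase whose fields may interleave strings and MLLMImage objects, per the List[Union[str, MLLMImage]] annotation on get_image_context. A usage sketch under those assumptions (the URL and the no-argument constructor are illustrative, not verified against the release):

from deepeval.metrics.multimodal_metrics import ImageCoherenceMetric
from deepeval.test_case import LLMTestCase, MLLMImage

# actual_output interleaves text and images; the metric keeps the MLLMImage
# elements via isinstance checks, as in get_image_context above.
test_case = LLMTestCase(
    input="Explain the attached chart.",
    actual_output=[
        "Revenue grew every quarter of 2024.",
        MLLMImage(url="https://example.com/chart.png"),  # illustrative URL
    ],
)

metric = ImageCoherenceMetric()  # assumes a default judge model is configured
metric.measure(test_case)
print(metric.score)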
--- a/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
+++ b/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
@@ -3,7 +3,7 @@ from typing import Optional, List, Tuple, Union
 import math
 import textwrap
 
-from deepeval.metrics import
+from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_editing.template import (
     ImageEditingTemplate,
@@ -14,16 +14,17 @@ from deepeval.utils import (
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-
-    check_mllm_test_case_params,
+    check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_editing.schema import ReasonScore
 from deepeval.metrics.indicator import metric_progress_indicator
 
 
-class ImageEditingMetric(
+class ImageEditingMetric(BaseMetric):
 
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
@@ -52,8 +53,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            1,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -103,7 +110,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
             steps=[
                 f"Semantic Consistency Scores:\n{self.SC_scores}",
                 f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                f"Perceptual Quality Scores:\n{self.
+                f"Perceptual Quality Scores:\n{self.PQ_scores}",
                 f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
@@ -117,8 +124,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            1,
+            1,
+            self,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -158,7 +171,7 @@ class ImageEditingMetric(BaseMultimodalMetric):
             steps=[
                 f"Semantic Consistency Scores:\n{self.SC_scores}",
                 f"Semantic Consistency Reasoning:\n{self.SC_reasoning}",
-                f"Perceptual Quality Scores:\n{self.
+                f"Perceptual Quality Scores:\n{self.PQ_scores}",
                 f"Perceptual Quality Reasoning:\n{self.PQ_reasoning}",
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
@@ -190,24 +203,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 text_prompt=text_prompt
             )
         ]
-
-
-
-
-
-
-
-        try:
-            res: ReasonScore = await self.model.a_generate(
-                prompt + images, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(
-                prompt + images, input_text=prompt
-            )
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_semantic_consistency(
         self,
@@ -222,20 +224,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
                 text_prompt=text_prompt
             )
         ]
-
-
-
-
-
-
-
-                prompt + images, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt + images)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     async def _a_evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -244,22 +239,13 @@ class ImageEditingMetric(BaseMultimodalMetric):
         prompt = [
             ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
-
-
-
-
-
-
-
-        try:
-            res: ReasonScore = await self.model.a_generate(
-                prompt + images, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = await self.model.a_generate(prompt + images)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
     def _evaluate_perceptual_quality(
         self, actual_image_output: MLLMImage
@@ -268,22 +254,15 @@ class ImageEditingMetric(BaseMultimodalMetric):
         prompt = [
             ImageEditingTemplate.generate_perceptual_quality_evaluation_results()
         ]
-
-
-
-
-
-
-
-                prompt + images, schema=ReasonScore
-            )
-            return res.score, res.reasoning
-        except TypeError:
-            res = self.model.generate(prompt + images)
-            data = trimAndLoadJson(res, self)
-            return data["score"], data["reasoning"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=f"{prompt} {images}",
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )
 
-    def _calculate_score(self) ->
+    def _calculate_score(self) -> float:
         min_SC_score = min(self.SC_scores)
         min_PQ_score = min(self.PQ_scores)
         return math.sqrt(min_SC_score * min_PQ_score) / 10
@@ -293,14 +272,14 @@ class ImageEditingMetric(BaseMultimodalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
     def _generate_reason(
         self,
-    ) ->
+    ) -> str:
         return textwrap.dedent(
             f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}