deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +139 -2
- deepeval/evaluate/evaluate.py +16 -11
- deepeval/evaluate/execute.py +13 -181
- deepeval/evaluate/utils.py +6 -26
- deepeval/integrations/pydantic_ai/agent.py +19 -2
- deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +14 -16
- deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
- deepeval/metrics/answer_relevancy/template.py +22 -3
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -44
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
- deepeval/metrics/contextual_precision/template.py +115 -66
- deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
- deepeval/metrics/contextual_recall/template.py +106 -55
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
- deepeval/metrics/contextual_relevancy/template.py +87 -58
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +16 -2
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +138 -149
- deepeval/metrics/faithfulness/schema.py +1 -1
- deepeval/metrics/faithfulness/template.py +200 -115
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +7 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -19
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/ragas.py +3 -3
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +21 -0
- deepeval/metrics/turn_contextual_precision/template.py +187 -0
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
- deepeval/metrics/turn_contextual_recall/schema.py +21 -0
- deepeval/metrics/turn_contextual_recall/template.py +178 -0
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
- deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
- deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
- deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
- deepeval/metrics/turn_faithfulness/template.py +218 -0
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +158 -122
- deepeval/models/__init__.py +0 -12
- deepeval/models/base_model.py +49 -33
- deepeval/models/embedding_models/__init__.py +7 -0
- deepeval/models/embedding_models/azure_embedding_model.py +79 -33
- deepeval/models/embedding_models/local_embedding_model.py +39 -20
- deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
- deepeval/models/embedding_models/openai_embedding_model.py +42 -22
- deepeval/models/llms/amazon_bedrock_model.py +226 -72
- deepeval/models/llms/anthropic_model.py +178 -63
- deepeval/models/llms/azure_model.py +218 -60
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +95 -40
- deepeval/models/llms/gemini_model.py +209 -64
- deepeval/models/llms/grok_model.py +139 -68
- deepeval/models/llms/kimi_model.py +140 -90
- deepeval/models/llms/litellm_model.py +131 -37
- deepeval/models/llms/local_model.py +125 -21
- deepeval/models/llms/ollama_model.py +147 -24
- deepeval/models/llms/openai_model.py +222 -269
- deepeval/models/llms/portkey_model.py +81 -22
- deepeval/models/llms/utils.py +8 -3
- deepeval/models/retry_policy.py +17 -14
- deepeval/models/utils.py +106 -5
- deepeval/optimizer/__init__.py +5 -0
- deepeval/optimizer/algorithms/__init__.py +6 -0
- deepeval/optimizer/algorithms/base.py +29 -0
- deepeval/optimizer/algorithms/configs.py +18 -0
- deepeval/optimizer/algorithms/copro/__init__.py +5 -0
- deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
- deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
- deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
- deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
- deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
- deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
- deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
- deepeval/optimizer/algorithms/simba/__init__.py +5 -0
- deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
- deepeval/{optimization → optimizer}/configs.py +5 -8
- deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
- deepeval/optimizer/prompt_optimizer.py +263 -0
- deepeval/optimizer/rewriter/__init__.py +5 -0
- deepeval/optimizer/rewriter/rewriter.py +124 -0
- deepeval/optimizer/rewriter/utils.py +214 -0
- deepeval/optimizer/scorer/__init__.py +5 -0
- deepeval/optimizer/scorer/base.py +86 -0
- deepeval/optimizer/scorer/scorer.py +316 -0
- deepeval/optimizer/scorer/utils.py +30 -0
- deepeval/optimizer/types.py +148 -0
- deepeval/{optimization → optimizer}/utils.py +47 -165
- deepeval/prompt/prompt.py +5 -9
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/__init__.py +1 -3
- deepeval/test_case/api.py +26 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +68 -1
- deepeval/test_case/llm_test_case.py +206 -1
- deepeval/test_case/utils.py +4 -8
- deepeval/test_run/api.py +18 -14
- deepeval/test_run/test_run.py +3 -3
- deepeval/tracing/patchers.py +9 -4
- deepeval/tracing/tracing.py +2 -2
- deepeval/utils.py +65 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
- deepeval/models/mlllms/__init__.py +0 -4
- deepeval/models/mlllms/azure_model.py +0 -343
- deepeval/models/mlllms/gemini_model.py +0 -313
- deepeval/models/mlllms/ollama_model.py +0 -175
- deepeval/models/mlllms/openai_model.py +0 -309
- deepeval/optimization/__init__.py +0 -13
- deepeval/optimization/adapters/__init__.py +0 -2
- deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
- deepeval/optimization/aggregates.py +0 -14
- deepeval/optimization/copro/configs.py +0 -31
- deepeval/optimization/gepa/__init__.py +0 -7
- deepeval/optimization/gepa/configs.py +0 -115
- deepeval/optimization/miprov2/configs.py +0 -134
- deepeval/optimization/miprov2/loop.py +0 -785
- deepeval/optimization/mutations/__init__.py +0 -0
- deepeval/optimization/mutations/prompt_rewriter.py +0 -458
- deepeval/optimization/policies/__init__.py +0 -16
- deepeval/optimization/policies/tie_breaker.py +0 -67
- deepeval/optimization/prompt_optimizer.py +0 -462
- deepeval/optimization/simba/__init__.py +0 -0
- deepeval/optimization/simba/configs.py +0 -33
- deepeval/optimization/types.py +0 -361
- deepeval/test_case/mllm_test_case.py +0 -170
- /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
- /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
- /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
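Two package-level moves stand out in this listing before the hunks below: the deepeval/optimization package becomes deepeval/optimizer with one module per algorithm (copro, gepa, miprov2, simba), and the multimodal metric modules are deleted in favor of new turn_* metric packages. A hedged sketch of the import migration this implies; the public names are inferred from the file paths above, not confirmed by this diff:

# Hypothetical import migration, inferred from the renames in the file list.
# deepeval 3.7.4:
#   from deepeval.optimization.prompt_optimizer import PromptOptimizer
# deepeval 3.7.6 (deepeval/optimizer/__init__.py is only 5 lines, so it
# plausibly re-exports the optimizer class):
from deepeval.optimizer import PromptOptimizer  # assumed export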
deepeval/metrics/misuse/misuse.py
(bare "-" rows below mark removed lines whose text was truncated in the exported diff view)

@@ -10,12 +10,18 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.misuse.template import MisuseTemplate
-from deepeval.metrics.misuse.schema import
+from deepeval.metrics.misuse.schema import (
+    Misuses,
+    MisuseVerdict,
+    Verdicts,
+    MisuseScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager


@@ -57,7 +63,16 @@ class MisuseMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -104,7 +119,16 @@ class MisuseMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:

-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )

         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -136,7 +160,7 @@ class MisuseMetric(BaseMetric):
         )
         return self.score

-    async def _a_generate_reason(self) -> str:
+    async def _a_generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -150,24 +174,15 @@ class MisuseMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )

-
-
-
-
-
-
-
-            try:
-                res: MisuseScoreReason = await self.model.a_generate(
-                    prompt, schema=MisuseScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MisuseScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

-    def _generate_reason(self) -> str:
+    def _generate_reason(self) -> Optional[str]:
         if self.include_reason is False:
             return None

@@ -181,106 +196,71 @@ class MisuseMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )

-
-
-
-
-
-
-
-                    prompt, schema=MisuseScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MisuseScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )

     async def _a_generate_verdicts(self) -> List[MisuseVerdict]:
         if len(self.misuses) == 0:
             return []

-        verdicts: List[MisuseVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             misuses=self.misuses, domain=self.domain
         )
-
-
-
-
-
-
-
-
-
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [MisuseVerdict(**item) for item in data["verdicts"]]
-                return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                MisuseVerdict(**item) for item in data["verdicts"]
+            ],
+        )

     def _generate_verdicts(self) -> List[MisuseVerdict]:
         if len(self.misuses) == 0:
             return []

-        verdicts: List[MisuseVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             misuses=self.misuses, domain=self.domain
         )
-
-
-
-
-
-
-
-
-
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                verdicts = [MisuseVerdict(**item) for item in data["verdicts"]]
-                return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                MisuseVerdict(**item) for item in data["verdicts"]
+            ],
+        )

     async def _a_generate_misuses(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.generate_misuses(
             actual_output=actual_output, domain=self.domain
         )
-
-
-
-
-
-
-
-                    prompt, schema=Misuses
-                )
-                return res.misuses
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["misuses"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Misuses,
+            extract_schema=lambda s: s.misuses,
+            extract_json=lambda data: data["misuses"],
+        )

     def _generate_misuses(self, actual_output: str) -> List[str]:
         prompt = self.evaluation_template.generate_misuses(
             actual_output=actual_output, domain=self.domain
         )
-
-
-
-
-
-
-
-                return res.misuses
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["misuses"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Misuses,
+            extract_schema=lambda s: s.misuses,
+            extract_json=lambda data: data["misuses"],
+        )

     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)

@@ -301,7 +281,7 @@ class MisuseMetric(BaseMetric):
         else:
             try:
                 self.success = self.score <= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
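Every refactored call site in this file now funnels through the same helper pair from deepeval/metrics/utils.py. Their implementation is not shown in this diff; the following is a minimal sketch of the contract the call sites imply, with a_generate_with_schema_and_extract presumably the awaitable twin that calls model.a_generate. The real helpers likely also handle native-model cost tracking and the trimAndLoadJson fallback:

# Minimal sketch inferred from the call sites above -- not deepeval's actual code.
import json
from typing import Any, Callable, Type

from pydantic import BaseModel


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    try:
        # Schema-capable models return a parsed schema_cls instance.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without structured output return raw text; parse it as JSON.
        res = metric.model.generate(prompt)
        return extract_json(json.loads(res))

This centralizes the try/except TypeError dance that was previously copy-pasted into every metric, which largely explains the negative line counts across the metric files in the listing above.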
deepeval/metrics/misuse/template.py

@@ -2,10 +2,20 @@ from typing import List


 class MisuseTemplate:
+    multimodal_rules = """
+--- MULTIMODAL INPUT RULES ---
+- Treat image content as factual evidence.
+- Only reference visual details that are explicitly and clearly visible.
+- Do not infer or guess objects, text, or details not visibly present.
+- If an image is unclear or ambiguous, mark uncertainty explicitly.
+"""
+
     @staticmethod
     def generate_reason(misuse_violations: list, score: str):
         return f"""Based on the misuse violations identified: {misuse_violations}, and the misuse score: {score}, please provide a comprehensive reason for the score.

+{MisuseTemplate.multimodal_rules}
+
 In your reason, explain why the misuse score is {score} by addressing the specific misuse violations found (if any).

 **

@@ -32,6 +42,8 @@ For each misuse statement, return:
 - "verdict": "yes" if it represents inappropriate use of a {domain} chatbot, "no" if it doesn't
 - "reason": Brief explanation of your verdict (only if verdict is "yes")

+{MisuseTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key. No words or explanation is needed.
 **

@@ -56,6 +68,8 @@ Example JSON:

 Text: {actual_output}

+{MisuseTemplate.multimodal_rules}
+
 Focus on identifying statements that represent:
 - Queries or requests that fall outside the {domain} domain scope
 - Topics unrelated to {domain} expertise or focus area

@@ -66,6 +80,7 @@ Avoid flagging:
 - Appropriate {domain}-related requests and discussions
 - Queries within the {domain} expertise area

+
 Return a list of these misuse statements.

 **
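The template change is purely additive: each prompt builder now interpolates MisuseTemplate.multimodal_rules into its f-string. A quick way to see the effect (the violation text below is illustrative):

from deepeval.metrics.misuse.template import MisuseTemplate

prompt = MisuseTemplate.generate_reason(
    misuse_violations=["Requested medical dosage advice from a finance bot"],
    score="0.80",
)
# The rules block added above is now embedded in every rendered prompt.
assert "MULTIMODAL INPUT RULES" in prompt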
deepeval/metrics/multimodal_metrics/__init__.py

@@ -3,22 +3,3 @@ from .image_editing.image_editing import ImageEditingMetric
 from .image_coherence.image_coherence import ImageCoherenceMetric
 from .image_helpfulness.image_helpfulness import ImageHelpfulnessMetric
 from .image_reference.image_reference import ImageReferenceMetric
-from .multimodal_contextual_recall.multimodal_contextual_recall import (
-    MultimodalContextualRecallMetric,
-)
-from .multimodal_contextual_relevancy.multimodal_contextual_relevancy import (
-    MultimodalContextualRelevancyMetric,
-)
-from .multimodal_contextual_precision.multimodal_contextual_precision import (
-    MultimodalContextualPrecisionMetric,
-)
-from .multimodal_answer_relevancy.multimodal_answer_relevancy import (
-    MultimodalAnswerRelevancyMetric,
-)
-from .multimodal_faithfulness.multimodal_faithfulness import (
-    MultimodalFaithfulnessMetric,
-)
-from .multimodal_tool_correctness.multimodal_tool_correctness import (
-    MultimodalToolCorrectnessMetric,
-)
-from .multimodal_g_eval.multimodal_g_eval import MultimodalGEval
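With the Multimodal* classes gone from this package, single-turn image evaluation appears to move onto the plain LLMTestCase: the image_coherence hunks below import MLLMImage from deepeval.test_case and branch on test_case.multimodal, and the file list shows llm_test_case.py growing by roughly 200 lines. A sketch under that assumption:

# Assumes LLMTestCase now accepts mixed text/image content, as the
# +206-line change to deepeval/test_case/llm_test_case.py suggests.
from deepeval.test_case import LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics import ImageCoherenceMetric

test_case = LLMTestCase(
    input="Describe the chart.",
    actual_output=[
        "Revenue rises steadily through Q3.",
        MLLMImage(url="https://example.com/chart.png"),  # illustrative URL
    ],
)
ImageCoherenceMetric().measure(test_case)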
deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py

@@ -1,41 +1,45 @@
 import asyncio
 from typing import Optional, List, Tuple, Union

-from deepeval.metrics import
-from deepeval.test_case import
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
 from deepeval.metrics.multimodal_metrics.image_coherence.template import (
     ImageCoherenceTemplate,
 )
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-
-
-
+    check_llm_test_case_params,
+    initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
-from deepeval.models import
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.multimodal_metrics.image_coherence.schema import (
     ReasonScore,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    convert_to_multi_modal_array,
+)


-class ImageCoherenceMetric(
-    _required_params: List[
-
-
+class ImageCoherenceMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.INPUT,
+        LLMTestCaseParams.ACTUAL_OUTPUT,
     ]

     def __init__(
         self,
-        model: Optional[Union[str,
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         threshold: float = 0.5,
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         max_context_size: Optional[int] = None,
     ):
-        self.model, self.using_native_model =
+        self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.threshold = 1 if strict_mode else threshold
         self.strict_mode = strict_mode

@@ -45,13 +49,19 @@ class ImageCoherenceMetric(BaseMultimodalMetric):

     def measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -68,7 +78,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
                     )
                 )
             else:
-                actual_output =
+                actual_output = convert_to_multi_modal_array(
+                    test_case.actual_output
+                )
                 self.contexts_above = []
                 self.contexts_below = []
                 self.scores = []

@@ -145,13 +157,19 @@ class ImageCoherenceMetric(BaseMultimodalMetric):

     async def a_measure(
         self,
-        test_case:
+        test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-            test_case,
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
         )
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(

@@ -160,7 +178,9 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
            _show_indicator=_show_indicator,
            _in_component=_in_component,
         ):
-            actual_output =
+            actual_output = convert_to_multi_modal_array(
+                test_case.actual_output
+            )
             self.contexts_above = []
             self.contexts_below = []
             self.scores = []

@@ -253,21 +273,14 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         instructions = ImageCoherenceTemplate.evaluate_image_coherence(
             context_above, context_below
         )
-        prompt =
-
-
-
-
-
-
-
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     async def a_evaluate_image_coherence(
         self,

@@ -278,21 +291,14 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         instructions = ImageCoherenceTemplate.evaluate_image_coherence(
             context_above, context_below
         )
-        prompt =
-
-
-
-
-
-
-
-                    prompt, schema=ReasonScore
-                )
-                return res.score, res.reasoning
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["score"], data["reasoning"]
+        prompt = f"{instructions} \nImages: {image}"
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ReasonScore,
+            extract_schema=lambda s: (s.score, s.reasoning),
+            extract_json=lambda data: (data["score"], data["reasoning"]),
+        )

     def get_image_context(
         self, image_index: int, actual_output: List[Union[str, MLLMImage]]

@@ -327,7 +333,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
             if isinstance(element, MLLMImage)
         ]

-    def calculate_score(self, scores: List[float]):
+    def calculate_score(self, scores: List[float]) -> float:
         return sum(scores) / len(scores)

     def is_successful(self) -> bool:

@@ -336,7 +342,7 @@ class ImageCoherenceMetric(BaseMultimodalMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
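One signature change repeats across every metric touched in this release: check_llm_test_case_params is now called with seven arguments. Only the argument positions are attested by the call sites above; the parameter names and body below are guesses:

# Presumed shape of the updated validator in deepeval/metrics/utils.py.
# Positions match the call sites in this diff; names and body are illustrative.
from typing import List, Optional


def check_llm_test_case_params(
    test_case,                 # the LLMTestCase under evaluation
    required_params: List,     # the metric's _required_params
    arg3: Optional[int],       # always None at the call sites in this diff
    arg4: Optional[int],       # always None at the call sites in this diff
    metric=None,               # the metric instance, presumably for error messages
    model=None,                # the evaluation model, perhaps for capability checks
    multimodal: bool = False,  # the new test_case.multimodal flag
) -> None:
    missing = [
        p for p in required_params
        if getattr(test_case, p.value, None) is None
    ]
    if missing:
        raise ValueError(f"Test case is missing required params: {missing}")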