deepeval-3.7.5-py3-none-any.whl → deepeval-3.7.7-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/goal_accuracy/template.py
CHANGED

@@ -3,8 +3,16 @@ import textwrap
 
 
 class GoalAccuracyTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
-    def get_accuracy_score(task, steps_taken):
+    def get_accuracy_score(task, steps_taken, multimodal: bool = False):
         return textwrap.dedent(
             f"""You are an expert evaluator assessing the **goal accuracy** of an AI assistant's single interaction.
 
@@ -36,6 +44,8 @@ class GoalAccuracyTemplate:
        - When uncertain, assume the goal was **not achieved**.
        - The metric is designed to fail unless the assistant's output is precise, complete, and user-visible.
 
+       {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
        SCORING GUIDE:
 
        - **1.0** → Goal completely and correctly achieved; all required outputs visible to the user.
@@ -102,7 +112,7 @@ class GoalAccuracyTemplate:
         )
 
     @staticmethod
-    def get_plan_evaluation_score(task, steps_taken):
+    def get_plan_evaluation_score(task, steps_taken, multimodal: bool = False):
         return textwrap.dedent(
             f"""You are an expert evaluator assessing the **planning quality** and **plan adherence** of an AI agent tasked with fulfilling a user's request.
 
@@ -132,6 +142,8 @@ class GoalAccuracyTemplate:
        - Tool use should be coherent within the plan, not ad hoc or speculative.
        - This evaluation excludes correctness or efficiency — focus solely on plan and adherence.
 
+       {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
        SCORING GUIDE:
 
        - **1.0** → Complete, clear, and logical plan **fully followed** with all steps aligned to the user's goal.
@@ -188,7 +200,11 @@ class GoalAccuracyTemplate:
 
     @staticmethod
     def get_final_reason(
-        final_score,
+        final_score,
+        threshold,
+        goal_evaluations,
+        plan_evalautions,
+        multimodal: bool = False,
     ):
         return textwrap.dedent(
             f"""You are an expert evaluator providing a **final justification** for whether an AI agent has passed or failed an evaluation metric.
@@ -213,6 +229,8 @@ class GoalAccuracyTemplate:
        - If the agent **failed**, explain which aspects (task or plan or both) led to the failure.
        - Avoid vague praise or criticism — ground the reason in the actual scores and justifications.
 
+       {GoalAccuracyTemplate.multimodal_rules if multimodal else ""}
+
        ---
 
        FORMAT:
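The pattern above recurs across most template files in this release: each template class gains a class-level multimodal_rules string, its prompt builders grow a `multimodal: bool = False` parameter, and the rules are spliced into the prompt through a conditional f-string expression. A minimal, self-contained sketch of that mechanism (the ExampleTemplate class and its task are illustrative, not part of deepeval):

```python
import textwrap


class ExampleTemplate:
    # Same shape as the multimodal_rules blocks added in this release.
    multimodal_rules = """
    --- MULTIMODAL INPUT RULES ---
    - Treat image content as factual evidence.
    - Only reference visual details that are explicitly and clearly visible.
    """

    @staticmethod
    def get_prompt(task: str, multimodal: bool = False) -> str:
        # The conditional expression interpolates the rules only when the
        # flag is set; otherwise an empty string is inserted.
        return textwrap.dedent(
            f"""Evaluate the following task: {task}

            {ExampleTemplate.multimodal_rules if multimodal else ""}

            Return a score between 0 and 1."""
        )


print(ExampleTemplate.get_prompt("book a flight"))                   # no rules
print(ExampleTemplate.get_prompt("book a flight", multimodal=True))  # rules included
```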
deepeval/metrics/hallucination/hallucination.py
CHANGED

@@ -8,14 +8,19 @@ from deepeval.metrics import BaseMetric
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.hallucination.template import HallucinationTemplate
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.hallucination.schema import
+from deepeval.metrics.hallucination.schema import (
+    HallucinationVerdict,
+    Verdicts,
+    HallucinationScoreReason,
+)
 from deepeval.metrics.api import metric_data_manager
 
 
@@ -55,7 +60,16 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -102,7 +116,16 @@ class HallucinationMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -150,22 +173,13 @@ class HallucinationMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-
-
-
-
-
-
-
-        try:
-            res: HallucinationScoreReason = await self.model.a_generate(
-                prompt, schema=HallucinationScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=HallucinationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason(self):
         if self.include_reason is False:
@@ -185,74 +199,45 @@ class HallucinationMetric(BaseMetric):
             score=format(self.score, ".2f"),
         )
 
-
-
-
-
-
-
-
-        try:
-            res: HallucinationScoreReason = self.model.generate(
-                prompt, schema=HallucinationScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=HallucinationScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, actual_output: str, contexts: List[str]
     ) -> List[HallucinationVerdict]:
-        verdicts: List[HallucinationVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             actual_output=actual_output, contexts=contexts
         )
-
-
-
-
-
-
-
-
-
-            )
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                HallucinationVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                HallucinationVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(
         self, actual_output: str, contexts: List[str]
     ) -> List[HallucinationVerdict]:
-        verdicts: List[HallucinationVerdict] = []
         prompt = self.evaluation_template.generate_verdicts(
             actual_output=actual_output, contexts=contexts
         )
-
-
-
-
-
-
-
-
-
-            return verdicts
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                HallucinationVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                HallucinationVerdict(**item) for item in data["verdicts"]
+            ],
+        )
 
     def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
@@ -273,7 +258,7 @@ class HallucinationMetric(BaseMetric):
         else:
             try:
                 self.success = self.score <= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
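Across the metric files in this diff, the repeated try/except TypeError plus trimAndLoadJson pattern is consolidated behind two helpers imported from deepeval.metrics.utils: generate_with_schema_and_extract and its async counterpart a_generate_with_schema_and_extract. Their implementation is not shown in this diff; based on the inline code they replace above, a plausible sketch looks like the following (json.loads stands in for deepeval's trimAndLoadJson, and native-model cost tracking is omitted):

```python
import json


def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    # Sketch only: reconstructs the inline pattern this release removes.
    try:
        # Schema-aware path: models that accept `schema=` return an
        # instance of schema_cls, so the schema extractor applies.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback path: plain-text generation, parsed as JSON and fed
        # to the JSON extractor instead.
        res = metric.model.generate(prompt)
        return extract_json(json.loads(res))
```

Call sites then shrink to a single expression, with the schema class and the two extractor lambdas as the only per-metric variation.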
deepeval/metrics/hallucination/template.py
CHANGED

@@ -2,9 +2,20 @@ from typing import List
 
 
 class HallucinationTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(actual_output: str, contexts: List[str]):
         return f"""For each context in contexts, which is a list of strings, please generate a list of JSON objects to indicate whether the given 'actual output' agrees with EACH context. The JSON will have 2 fields: 'verdict' and 'reason'.
+
+{HallucinationTemplate.multimodal_rules}
+
 The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the given text agrees with the context.
 The 'reason' is the reason for the verdict. When the answer is 'no', try to provide a correction in the reason.
 
@@ -46,6 +57,8 @@ JSON:
     ):
         return f"""Given a list of factual alignments and contradictions, which highlights alignment/contradictions between the `actual output` and `contexts, use it to provide a reason for the hallucination score in a CONCISELY. Note that The hallucination score ranges from 0 - 1, and the lower the better.
 
+{HallucinationTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
deepeval/metrics/indicator.py
CHANGED

@@ -1,24 +1,24 @@
+import asyncio
+import logging
+import sys
+import time
 from rich.console import Console
 from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
 from contextlib import contextmanager
-import sys
 from typing import List, Optional, Union
-import time
-import asyncio
 
 from deepeval.errors import MissingTestCaseParamsError
 from deepeval.metrics import (
     BaseMetric,
     BaseConversationalMetric,
-    BaseMultimodalMetric,
     BaseArenaMetric,
 )
 from deepeval.test_case import LLMTestCase, ConversationalTestCase
 from deepeval.test_run.cache import CachedTestCase, Cache
 from deepeval.telemetry import capture_metric_type
 from deepeval.utils import update_pbar
+from deepeval.config.settings import get_settings
 
-import logging
 
 logger = logging.getLogger(__name__)
 
@@ -74,7 +74,7 @@ def metric_progress_indicator(
 async def measure_metric_task(
     task_id,
     progress,
-    metric: Union[BaseMetric,
+    metric: Union[BaseMetric, BaseConversationalMetric],
     test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     cached_test_case: Union[CachedTestCase, None],
     ignore_errors: bool,
@@ -156,9 +156,7 @@ async def measure_metric_task(
 
 
 async def measure_metrics_with_indicator(
-    metrics: List[
-        Union[BaseMetric, BaseMultimodalMetric, BaseConversationalMetric]
-    ],
+    metrics: List[Union[BaseMetric, BaseConversationalMetric]],
     test_case: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     cached_test_case: Union[CachedTestCase, None],
     ignore_errors: bool,
@@ -238,7 +236,7 @@ async def measure_metrics_with_indicator(
 
 
 async def safe_a_measure(
-    metric: Union[BaseMetric,
+    metric: Union[BaseMetric, BaseConversationalMetric],
     tc: Union[LLMTestCase, LLMTestCase, ConversationalTestCase],
     ignore_errors: bool,
     skip_on_missing_params: bool,
@@ -263,6 +261,9 @@ async def safe_a_measure(
             "Timed out/cancelled while evaluating metric. "
             "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
             "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
+            if not get_settings().DEEPEVAL_DISABLE_TIMEOUTS
+            else "Cancelled while evaluating metric (DeepEval timeouts are disabled; this likely came from upstream orchestration or the provider/network layer). "
+            "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
         )
         metric.success = False
 
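The reworked timeout message relies on a conditional expression over string literals; since adjacent literals concatenate at compile time, each branch is one string assembled from several source lines. A small sketch of how the final message is selected (the local boolean stands in for get_settings().DEEPEVAL_DISABLE_TIMEOUTS):

```python
disable_timeouts = True  # stands in for get_settings().DEEPEVAL_DISABLE_TIMEOUTS

# Each branch of the conditional expression is a single string built from
# adjacent literals, mirroring the structure added in this diff.
msg = (
    "Timed out/cancelled while evaluating metric. "
    "Increase DEEPEVAL_PER_TASK_TIMEOUT_SECONDS_OVERRIDE or set "
    "DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
    if not disable_timeouts
    else "Cancelled while evaluating metric (DeepEval timeouts are disabled). "
    "Set DEEPEVAL_LOG_STACK_TRACES=1 for full traceback."
)
print(msg)  # prints the "timeouts are disabled" variant here
```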
deepeval/metrics/json_correctness/json_correctness.py
CHANGED

@@ -11,7 +11,8 @@ from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
     initialize_model,
-
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
@@ -46,6 +47,7 @@ class JsonCorrectnessMetric(BaseMetric):
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
         self.expected_schema = expected_schema
+        self.evaluation_model = self.model.get_model_name()
 
     def measure(
         self,
@@ -55,7 +57,16 @@ class JsonCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -77,7 +88,7 @@ class JsonCorrectnessMetric(BaseMetric):
                 self.expected_schema.model_validate_json(
                     test_case.actual_output
                 )
-            except ValidationError
+            except ValidationError:
                 valid_json = False
 
             self.score = 1 if valid_json else 0
@@ -106,7 +117,16 @@ class JsonCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -120,7 +140,7 @@ class JsonCorrectnessMetric(BaseMetric):
                 self.expected_schema.model_validate_json(
                     test_case.actual_output
                 )
-            except ValidationError
+            except ValidationError:
                 valid_json = False
 
             self.score = 1 if valid_json else 0
@@ -156,22 +176,13 @@ class JsonCorrectnessMetric(BaseMetric):
             is_valid_json=is_valid_json,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: JsonCorrectnessScoreReason = await self.model.a_generate(
-                prompt, schema=JsonCorrectnessScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=JsonCorrectnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def generate_reason(self, actual_output: str) -> str:
         if self.include_reason is False:
@@ -189,22 +200,13 @@ class JsonCorrectnessMetric(BaseMetric):
             is_valid_json=is_valid_json,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: JsonCorrectnessScoreReason = self.model.generate(
-                prompt, schema=JsonCorrectnessScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=JsonCorrectnessScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def is_successful(self) -> bool:
         if self.error is not None:
@@ -212,7 +214,7 @@ class JsonCorrectnessMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
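The validity check in JsonCorrectnessMetric is plain pydantic: expected_schema is a pydantic model class, and the output scores 1 only when it parses against that schema, as the except ValidationError branches above show. A self-contained sketch of that logic (PersonSchema and the sample output are made up for illustration):

```python
from pydantic import BaseModel, ValidationError


class PersonSchema(BaseModel):
    # Hypothetical stand-in for the metric's expected_schema.
    name: str
    age: int


actual_output = '{"name": "Ada", "age": "not a number"}'

valid_json = True
try:
    # Same call the metric makes: parse the raw output against the schema.
    PersonSchema.model_validate_json(actual_output)
except ValidationError:
    valid_json = False

score = 1 if valid_json else 0
print(score)  # 0: "age" does not parse as an int
```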
deepeval/metrics/json_correctness/template.py
CHANGED

@@ -2,12 +2,22 @@ from typing import Optional
 
 
 class JsonCorrectnessTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_reason(
         actual_output: str, expected_schema: str, is_valid_json: bool
     ):
         return f"""Based on the given generated json, generated by an LLM, and a boolean stating whether it is a valid JSON based on the expected json schema, give a reason why it is OR is not a valid Json.
+
+{JsonCorrectnessTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
|