deepeval-3.7.5-py3-none-any.whl → deepeval-3.7.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/prompt_alignment/prompt_alignment.py

```diff
@@ -2,12 +2,17 @@ import asyncio
 
 from typing import Optional, List, Union
 
-from deepeval.utils import
+from deepeval.utils import (
+    get_or_create_event_loop,
+    prettify_list,
+    get_per_task_timeout,
+)
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -18,7 +23,6 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.prompt_alignment import schema as paschema
-from deepeval.config.settings import get_settings
 
 from deepeval.metrics.api import metric_data_manager
 
@@ -60,7 +64,15 @@ class PromptAlignmentMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -72,16 +84,19 @@ class PromptAlignmentMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
                 loop.run_until_complete(
                     asyncio.wait_for(
                         coro,
-                        timeout=
+                        timeout=get_per_task_timeout(),
                     )
                 )
             else:
-                self.verdicts: paschema.
-
+                self.verdicts: List[paschema.PromptAlignmentVerdict] = (
+                    self._generate_verdicts(
+                        test_case.input, test_case.actual_output
+                    )
                 )
                 self.score = self._calculate_score()
                 self.reason = self._generate_reason(
@@ -111,7 +126,15 @@ class PromptAlignmentMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -120,8 +143,10 @@ class PromptAlignmentMetric(BaseMetric):
             _show_indicator=_show_indicator,
             _in_component=_in_component,
         ):
-            self.verdicts: paschema.
-
+            self.verdicts: List[paschema.PromptAlignmentVerdict] = (
+                await self._a_generate_verdicts(
+                    test_case.input, test_case.actual_output
+                )
             )
             self.score = self._calculate_score()
             self.reason = await self._a_generate_reason(
@@ -142,7 +167,9 @@ class PromptAlignmentMetric(BaseMetric):
         )
         return self.score
 
-    async def _a_generate_reason(
+    async def _a_generate_reason(
+        self, input: str, actual_output: str
+    ) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -157,27 +184,16 @@ class PromptAlignmentMetric(BaseMetric):
             actual_output=actual_output,
             score=format(self.score, ".2f"),
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=paschema.PromptAlignmentScoreReason
-            )
-            self.evaluation_cost += cost
-            return res.reason
-        else:
-            try:
-                res: paschema.PromptAlignmentScoreReason = (
-                    await self.model.a_generate(
-                        prompt=prompt,
-                        schema=paschema.PromptAlignmentScoreReason,
-                    )
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
 
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.PromptAlignmentScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
+
+    def _generate_reason(self, input: str, actual_output: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -192,78 +208,54 @@ class PromptAlignmentMetric(BaseMetric):
             actual_output=actual_output,
             score=format(self.score, ".2f"),
         )
-
-
-
-
-
-
-
-
-                res: paschema.PromptAlignmentScoreReason = self.model.generate(
-                    prompt=prompt, schema=paschema.PromptAlignmentScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.PromptAlignmentScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_verdicts(
         self, input: str, actual_output: str
-    ) -> paschema.
+    ) -> List[paschema.PromptAlignmentVerdict]:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
-
-
-
-
-
-
-
-
-
-
-                )
-                return [item for item in res.verdicts]
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    paschema.PromptAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                paschema.PromptAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )
 
     def _generate_verdicts(
         self, input: str, actual_output: str
-    ) -> paschema.
+    ) -> List[paschema.PromptAlignmentVerdict]:
         prompt = PromptAlignmentTemplate.generate_verdicts(
             prompt_instructions=self.prompt_instructions,
             input=input,
             actual_output=actual_output,
         )
-
-
-
-
-
-
-
-
-
-
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return [
-                    paschema.PromptAlignmentVerdict(**item)
-                    for item in data["verdicts"]
-                ]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=paschema.Verdicts,
+            extract_schema=lambda s: list(s.verdicts),
+            extract_json=lambda data: [
+                paschema.PromptAlignmentVerdict(**item)
+                for item in data["verdicts"]
+            ],
+        )
 
-    def _calculate_score(self):
+    def _calculate_score(self) -> float:
         number_of_verdicts = len(self.verdicts)
         if number_of_verdicts == 0:
             return 1
```
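Most of the churn in the metric files above follows one pattern: each metric's inline `if self.using_native_model ... try ... except TypeError ... trimAndLoadJson` fallback is replaced by a call to `generate_with_schema_and_extract` or its async twin `a_generate_with_schema_and_extract`, both imported from `deepeval.metrics.utils` (whose own diff is +161 -91). The helper bodies are not part of this diff; the sketch below is only inferred from the inline code being deleted, so everything inside the function is an assumption rather than the shipped implementation.

```python
# Sketch only: behavior inferred from the inline fallback code that 3.7.7 removes
# from each metric; the real helper lives in deepeval/metrics/utils.py and may differ.
from typing import Any, Callable, Type

from deepeval.metrics.utils import trimAndLoadJson  # deepeval's existing JSON-cleanup helper


def generate_with_schema_and_extract(
    metric,  # the metric instance; used for cost tracking and error context
    prompt: str,
    schema_cls: Type,  # pydantic schema passed to model.generate(..., schema=...)
    extract_schema: Callable[[Any], Any],  # pull the value off a returned schema object
    extract_json: Callable[[dict], Any],  # pull the value out of fallback-parsed JSON
) -> Any:
    if metric.using_native_model:
        # Native models return (result, cost); the cost is accumulated on the metric.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the schema object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without schema support: raw generation plus JSON parsing.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```

Whatever the exact implementation, callers now pass only a prompt, a schema class, and the two extraction callables, which is why `trimAndLoadJson` disappears from every per-metric import block in this release.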
deepeval/metrics/prompt_alignment/template.py

```diff
@@ -2,6 +2,14 @@ from typing import List
 
 
 class PromptAlignmentTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def generate_verdicts(
         prompt_instructions: List[str], input: str, actual_output: str
@@ -14,6 +22,8 @@ The 'reason' is the reason for the verdict.
 Provide a 'reason' ONLY if the answer is 'no'.
 The provided prompt instructions are the instructions to be followed in the prompt, which you have no access to.
 
+{PromptAlignmentTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
 Example input: What number is the stars of the sky?
@@ -63,6 +73,8 @@ The unalignments represent prompt instructions that are not followed by the LLM
 If there no unaligments, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
 Don't have to talk about whether the actual output is a good fit for the input, access ENTIRELY based on the unalignment reasons.
 
+{PromptAlignmentTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
```
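Both template classes in this release gain the same `multimodal_rules` class attribute and interpolate it into their prompt f-strings. The quickest way to see the effect is to render a prompt on each version; the instruction, input, and output values below are made up purely for illustration.

```python
from deepeval.metrics.prompt_alignment.template import PromptAlignmentTemplate

# Render the verdict prompt that PromptAlignmentMetric sends to its judge model.
prompt = PromptAlignmentTemplate.generate_verdicts(
    prompt_instructions=["Reply in formal English.", "Do not mention competitors."],
    input="Summarize the attached slide.",
    actual_output="Sure, here is a quick summary of the slide.",
)

# On 3.7.5 the marker is absent; on 3.7.7 the interpolated rules block appears
# between the instructions and the JSON-format reminder.
print("--- MULTIMODAL INPUT RULES ---" in prompt)
```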
deepeval/metrics/role_adherence/role_adherence.py

```diff
@@ -4,20 +4,21 @@ from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.api import metric_data_manager
 from deepeval.metrics.role_adherence.schema import (
     OutOfCharacterResponseVerdicts,
+    RoleAdherenceScoreReason,
 )
 from deepeval.metrics.role_adherence.template import RoleAdherenceTemplate
 from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     convert_turn_to_dict,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import Turn, ConversationalTestCase, TurnParams
 from deepeval.utils import get_or_create_event_loop, prettify_list
-from deepeval.metrics.role_adherence.schema import *
 
 
 class RoleAdherenceMetric(BaseConversationalMetric):
@@ -51,7 +52,9 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             test_case,
             self._required_test_case_params,
             self,
-
+            True,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -102,7 +105,9 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             test_case,
             self._required_test_case_params,
             self,
-
+            True,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -138,7 +143,7 @@ class RoleAdherenceMetric(BaseConversationalMetric):
         )
         return self.score
 
-    async def _a_generate_reason(self, role: str) -> str:
+    async def _a_generate_reason(self, role: str) -> Optional[str]:
         if self.include_reason is False:
             return None
 
@@ -150,24 +155,17 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 for verdict in self.out_of_character_verdicts.verdicts
             ],
         )
-
-
-
-
-
-
-
-            try:
-                res: RoleAdherenceScoreReason = await self.model.a_generate(
-                    prompt, schema=RoleAdherenceScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleAdherenceScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
-    def _generate_reason(self, role: str) -> str:
+    def _generate_reason(self, role: str) -> Optional[str]:
+        if self.include_reason is False:
+            return None
         prompt = RoleAdherenceTemplate.generate_reason(
             score=self.score,
             role=role,
@@ -176,22 +174,13 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 for verdict in self.out_of_character_verdicts.verdicts
             ],
         )
-
-
-
-
-
-
-
-            try:
-                res: RoleAdherenceScoreReason = self.model.generate(
-                    prompt, schema=RoleAdherenceScoreReason
-                )
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=RoleAdherenceScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_extract_out_of_character_verdicts(
         self, turns: List[Turn], role: str
@@ -202,28 +191,23 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 role=role,
             )
         )
-
-
-
+        res: OutOfCharacterResponseVerdicts = (
+            await a_generate_with_schema_and_extract(
+                metric=self,
+                prompt=prompt,
+                schema_cls=OutOfCharacterResponseVerdicts,
+                extract_schema=lambda s: s,
+                extract_json=lambda data: OutOfCharacterResponseVerdicts(
+                    **data
+                ),
             )
-
-        else:
-            try:
-                res: OutOfCharacterResponseVerdicts = (
-                    await self.model.a_generate(
-                        prompt, schema=OutOfCharacterResponseVerdicts
-                    )
-                )
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                res = OutOfCharacterResponseVerdicts(**data)
+        )
 
         for verdict in res.verdicts:
             try:
                 index = verdict.index
                 verdict.ai_message = f"{turns[index].content} (turn #{index+1})"
-            except:
+            except Exception:
                 pass
         return res
 
@@ -236,26 +220,19 @@ class RoleAdherenceMetric(BaseConversationalMetric):
                 role=role,
             )
         )
-
-
-
-
-
-
-
-            res: OutOfCharacterResponseVerdicts = self.model.generate(
-                prompt, schema=OutOfCharacterResponseVerdicts
-            )
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            res = OutOfCharacterResponseVerdicts(**data)
+        res: OutOfCharacterResponseVerdicts = generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=OutOfCharacterResponseVerdicts,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: OutOfCharacterResponseVerdicts(**data),
+        )
 
         for verdict in res.verdicts:
             try:
                 index = verdict.index
                 verdict.ai_message = f"{turns[index].content} (turn #{index+1})"
-            except:
+            except Exception:
                 pass
         return res
 
@@ -278,8 +255,8 @@ class RoleAdherenceMetric(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
```
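The final hunk above fixes a real bug in `is_successful()`: 3.7.5 evaluated `self.score >= self.threshold` without assigning the result (and hid the mistake behind a bare `except:`), so `self.success` was never updated on that path. The stand-in classes below are hypothetical and only illustrate the behavioral difference; they are not the real `RoleAdherenceMetric`.

```python
# Hypothetical stand-ins illustrating the is_successful() fix, not deepeval classes.
from typing import Optional


class OldBehavior:
    def __init__(self, score: float, threshold: float, error: Optional[str] = None):
        self.score, self.threshold, self.error = score, threshold, error
        self.success = None  # placeholder for whatever value was set earlier

    def is_successful(self) -> Optional[bool]:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.score >= self.threshold  # 3.7.5: comparison result discarded
            except:  # noqa: E722, mirrors the old bare except
                self.success = False
        return self.success


class NewBehavior(OldBehavior):
    def is_successful(self) -> Optional[bool]:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold  # 3.7.7: assigned
            except TypeError:  # only a genuinely failed comparison resets it
                self.success = False
        return self.success


print(OldBehavior(score=0.9, threshold=0.5).is_successful())  # None
print(NewBehavior(score=0.9, threshold=0.5).is_successful())  # True
```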
deepeval/metrics/role_adherence/template.py

```diff
@@ -2,11 +2,22 @@ from typing import List, Dict
 
 
 class RoleAdherenceTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def extract_out_of_character_response_verdicts(
         turns: List[Dict], role: str
     ):
         return f"""Based on the given list of message exchanges between a user and an LLM chatbot, generate a JSON object to specify which `ai_message` did not adhere to the specified chatbot role.
+
+{RoleAdherenceTemplate.multimodal_rules}
+
 The JSON will have 1 field: "verdicts", which is a list of verdicts specifying the indices and reasons of the LLM ai_message/responses that did NOT adhere to the chatbot role.
 You MUST USE look at all messages provided in the list of messages to make an informed judgement on role adherence.
 
@@ -72,6 +83,9 @@ JSON:
         return f"""Below is a list of LLM chatbot responses (ai_message) that is out of character with respect to the specified chatbot role. It is drawn from a list of messages in a conversation, which you have minimal knowledge of.
 Given the role adherence score, which is a 0-1 score indicating how well the chatbot responses has adhered to the given role through a conversation, with 1 being the best and 0 being worst, provide a reason by quoting the out of character responses to justify the score.
 
+
+{RoleAdherenceTemplate.multimodal_rules}
+
 **
 IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
 Example JSON:
```