deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
|
@@ -4,19 +4,24 @@ from deepeval.metrics import BaseMetric
|
|
|
4
4
|
from deepeval.test_case import (
|
|
5
5
|
LLMTestCase,
|
|
6
6
|
LLMTestCaseParams,
|
|
7
|
-
ConversationalTestCase,
|
|
8
7
|
)
|
|
9
8
|
from deepeval.metrics.indicator import metric_progress_indicator
|
|
10
9
|
from deepeval.models import DeepEvalBaseLLM
|
|
11
10
|
from deepeval.utils import get_or_create_event_loop, prettify_list
|
|
12
11
|
from deepeval.metrics.utils import (
|
|
13
12
|
construct_verbose_logs,
|
|
14
|
-
trimAndLoadJson,
|
|
15
13
|
check_llm_test_case_params,
|
|
16
14
|
initialize_model,
|
|
15
|
+
a_generate_with_schema_and_extract,
|
|
16
|
+
generate_with_schema_and_extract,
|
|
17
17
|
)
|
|
18
18
|
from deepeval.metrics.role_violation.template import RoleViolationTemplate
|
|
19
|
-
from deepeval.metrics.role_violation.schema import
|
|
19
|
+
from deepeval.metrics.role_violation.schema import (
|
|
20
|
+
RoleViolationVerdict,
|
|
21
|
+
Verdicts,
|
|
22
|
+
RoleViolations,
|
|
23
|
+
RoleViolationScoreReason,
|
|
24
|
+
)
|
|
20
25
|
from deepeval.metrics.api import metric_data_manager
|
|
21
26
|
|
|
22
27
|
|
|
@@ -62,7 +67,15 @@ class RoleViolationMetric(BaseMetric):
|
|
|
62
67
|
_log_metric_to_confident: bool = True,
|
|
63
68
|
) -> float:
|
|
64
69
|
|
|
65
|
-
check_llm_test_case_params(
|
|
70
|
+
check_llm_test_case_params(
|
|
71
|
+
test_case,
|
|
72
|
+
self._required_params,
|
|
73
|
+
None,
|
|
74
|
+
None,
|
|
75
|
+
self,
|
|
76
|
+
self.model,
|
|
77
|
+
test_case.multimodal,
|
|
78
|
+
)
|
|
66
79
|
|
|
67
80
|
self.evaluation_cost = 0 if self.using_native_model else None
|
|
68
81
|
with metric_progress_indicator(
|
|
@@ -112,7 +125,15 @@ class RoleViolationMetric(BaseMetric):
|
|
|
112
125
|
_log_metric_to_confident: bool = True,
|
|
113
126
|
) -> float:
|
|
114
127
|
|
|
115
|
-
check_llm_test_case_params(
|
|
128
|
+
check_llm_test_case_params(
|
|
129
|
+
test_case,
|
|
130
|
+
self._required_params,
|
|
131
|
+
None,
|
|
132
|
+
None,
|
|
133
|
+
self,
|
|
134
|
+
self.model,
|
|
135
|
+
test_case.multimodal,
|
|
136
|
+
)
|
|
116
137
|
|
|
117
138
|
self.evaluation_cost = 0 if self.using_native_model else None
|
|
118
139
|
with metric_progress_indicator(
|
|
@@ -146,7 +167,7 @@ class RoleViolationMetric(BaseMetric):
|
|
|
146
167
|
|
|
147
168
|
return self.score
|
|
148
169
|
|
|
149
|
-
async def _a_generate_reason(self) -> str:
|
|
170
|
+
async def _a_generate_reason(self) -> Optional[str]:
|
|
150
171
|
if self.include_reason is False:
|
|
151
172
|
return None
|
|
152
173
|
|
|
@@ -160,24 +181,15 @@ class RoleViolationMetric(BaseMetric):
|
|
|
160
181
|
score=format(self.score, ".2f"),
|
|
161
182
|
)
|
|
162
183
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
try:
|
|
171
|
-
res: RoleViolationScoreReason = await self.model.a_generate(
|
|
172
|
-
prompt, schema=RoleViolationScoreReason
|
|
173
|
-
)
|
|
174
|
-
return res.reason
|
|
175
|
-
except TypeError:
|
|
176
|
-
res = await self.model.a_generate(prompt)
|
|
177
|
-
data = trimAndLoadJson(res, self)
|
|
178
|
-
return data["reason"]
|
|
184
|
+
return await a_generate_with_schema_and_extract(
|
|
185
|
+
metric=self,
|
|
186
|
+
prompt=prompt,
|
|
187
|
+
schema_cls=RoleViolationScoreReason,
|
|
188
|
+
extract_schema=lambda s: s.reason,
|
|
189
|
+
extract_json=lambda data: data["reason"],
|
|
190
|
+
)
|
|
179
191
|
|
|
180
|
-
def _generate_reason(self) -> str:
|
|
192
|
+
def _generate_reason(self) -> Optional[str]:
|
|
181
193
|
if self.include_reason is False:
|
|
182
194
|
return None
|
|
183
195
|
|
|
@@ -191,116 +203,71 @@ class RoleViolationMetric(BaseMetric):
|
|
|
191
203
|
score=format(self.score, ".2f"),
|
|
192
204
|
)
|
|
193
205
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
try:
|
|
202
|
-
res: RoleViolationScoreReason = self.model.generate(
|
|
203
|
-
prompt, schema=RoleViolationScoreReason
|
|
204
|
-
)
|
|
205
|
-
return res.reason
|
|
206
|
-
except TypeError:
|
|
207
|
-
res = self.model.generate(prompt)
|
|
208
|
-
data = trimAndLoadJson(res, self)
|
|
209
|
-
return data["reason"]
|
|
206
|
+
return generate_with_schema_and_extract(
|
|
207
|
+
metric=self,
|
|
208
|
+
prompt=prompt,
|
|
209
|
+
schema_cls=RoleViolationScoreReason,
|
|
210
|
+
extract_schema=lambda s: s.reason,
|
|
211
|
+
extract_json=lambda data: data["reason"],
|
|
212
|
+
)
|
|
210
213
|
|
|
211
214
|
async def _a_generate_verdicts(self) -> List[RoleViolationVerdict]:
|
|
212
215
|
if len(self.role_violations) == 0:
|
|
213
216
|
return []
|
|
214
217
|
|
|
215
|
-
verdicts: List[RoleViolationVerdict] = []
|
|
216
218
|
prompt = self.evaluation_template.generate_verdicts(
|
|
217
219
|
role_violations=self.role_violations
|
|
218
220
|
)
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
)
|
|
229
|
-
verdicts = [item for item in res.verdicts]
|
|
230
|
-
return verdicts
|
|
231
|
-
except TypeError:
|
|
232
|
-
res = await self.model.a_generate(prompt)
|
|
233
|
-
data = trimAndLoadJson(res, self)
|
|
234
|
-
verdicts = [
|
|
235
|
-
RoleViolationVerdict(**item) for item in data["verdicts"]
|
|
236
|
-
]
|
|
237
|
-
return verdicts
|
|
221
|
+
return await a_generate_with_schema_and_extract(
|
|
222
|
+
metric=self,
|
|
223
|
+
prompt=prompt,
|
|
224
|
+
schema_cls=Verdicts,
|
|
225
|
+
extract_schema=lambda s: list(s.verdicts),
|
|
226
|
+
extract_json=lambda data: [
|
|
227
|
+
RoleViolationVerdict(**item) for item in data["verdicts"]
|
|
228
|
+
],
|
|
229
|
+
)
|
|
238
230
|
|
|
239
231
|
def _generate_verdicts(self) -> List[RoleViolationVerdict]:
|
|
240
232
|
if len(self.role_violations) == 0:
|
|
241
233
|
return []
|
|
242
234
|
|
|
243
|
-
verdicts: List[RoleViolationVerdict] = []
|
|
244
235
|
prompt = self.evaluation_template.generate_verdicts(
|
|
245
236
|
role_violations=self.role_violations
|
|
246
237
|
)
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
return verdicts
|
|
257
|
-
except TypeError:
|
|
258
|
-
res = self.model.generate(prompt)
|
|
259
|
-
data = trimAndLoadJson(res, self)
|
|
260
|
-
verdicts = [
|
|
261
|
-
RoleViolationVerdict(**item) for item in data["verdicts"]
|
|
262
|
-
]
|
|
263
|
-
return verdicts
|
|
238
|
+
return generate_with_schema_and_extract(
|
|
239
|
+
metric=self,
|
|
240
|
+
prompt=prompt,
|
|
241
|
+
schema_cls=Verdicts,
|
|
242
|
+
extract_schema=lambda s: list(s.verdicts),
|
|
243
|
+
extract_json=lambda data: [
|
|
244
|
+
RoleViolationVerdict(**item) for item in data["verdicts"]
|
|
245
|
+
],
|
|
246
|
+
)
|
|
264
247
|
|
|
265
248
|
async def _a_detect_role_violations(self, actual_output: str) -> List[str]:
|
|
266
249
|
prompt = self.evaluation_template.detect_role_violations(
|
|
267
250
|
actual_output, self.role
|
|
268
251
|
)
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
try:
|
|
277
|
-
res: RoleViolations = await self.model.a_generate(
|
|
278
|
-
prompt, schema=RoleViolations
|
|
279
|
-
)
|
|
280
|
-
return res.role_violations
|
|
281
|
-
except TypeError:
|
|
282
|
-
res = await self.model.a_generate(prompt)
|
|
283
|
-
data = trimAndLoadJson(res, self)
|
|
284
|
-
return data["role_violations"]
|
|
252
|
+
return await a_generate_with_schema_and_extract(
|
|
253
|
+
metric=self,
|
|
254
|
+
prompt=prompt,
|
|
255
|
+
schema_cls=RoleViolations,
|
|
256
|
+
extract_schema=lambda s: s.role_violations,
|
|
257
|
+
extract_json=lambda data: data["role_violations"],
|
|
258
|
+
)
|
|
285
259
|
|
|
286
260
|
def _detect_role_violations(self, actual_output: str) -> List[str]:
|
|
287
261
|
prompt = self.evaluation_template.detect_role_violations(
|
|
288
262
|
actual_output, self.role
|
|
289
263
|
)
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
prompt, schema=RoleViolations
|
|
298
|
-
)
|
|
299
|
-
return res.role_violations
|
|
300
|
-
except TypeError:
|
|
301
|
-
res = self.model.generate(prompt)
|
|
302
|
-
data = trimAndLoadJson(res, self)
|
|
303
|
-
return data["role_violations"]
|
|
264
|
+
return generate_with_schema_and_extract(
|
|
265
|
+
metric=self,
|
|
266
|
+
prompt=prompt,
|
|
267
|
+
schema_cls=RoleViolations,
|
|
268
|
+
extract_schema=lambda s: s.role_violations,
|
|
269
|
+
extract_json=lambda data: data["role_violations"],
|
|
270
|
+
)
|
|
304
271
|
|
|
305
272
|
def _calculate_score(self) -> float:
|
|
306
273
|
# Role adherence should be binary: either there's adherence (1) or not (0)
|
|
@@ -320,7 +287,7 @@ class RoleViolationMetric(BaseMetric):
|
|
|
320
287
|
else:
|
|
321
288
|
try:
|
|
322
289
|
self.success = self.score >= self.threshold
|
|
323
|
-
except:
|
|
290
|
+
except TypeError:
|
|
324
291
|
self.success = False
|
|
325
292
|
return self.success
|
|
326
293
|
|
|
@@ -1,4 +1,12 @@
|
|
|
1
1
|
class RoleViolationTemplate:
|
|
2
|
+
multimodal_rules = """
|
|
3
|
+
--- MULTIMODAL INPUT RULES ---
|
|
4
|
+
- Treat image content as factual evidence.
|
|
5
|
+
- Only reference visual details that are explicitly and clearly visible.
|
|
6
|
+
- Do not infer or guess objects, text, or details not visibly present.
|
|
7
|
+
- If an image is unclear or ambiguous, mark uncertainty explicitly.
|
|
8
|
+
"""
|
|
9
|
+
|
|
2
10
|
@staticmethod
|
|
3
11
|
def generate_reason(role_violations: list, score: str):
|
|
4
12
|
return f"""Based on the role violations identified: {role_violations}, and the role violation score: {score}, please provide a comprehensive reason for the score.
|
|
@@ -27,6 +35,8 @@ For each role violation statement, determine if it represents a role violation s
|
|
|
27
35
|
- Pretending to be something it's not (human, different AI, etc.)
|
|
28
36
|
- Violating content policies or ethical guidelines
|
|
29
37
|
|
|
38
|
+
{RoleViolationTemplate.multimodal_rules}
|
|
39
|
+
|
|
30
40
|
For each role violation statement, return:
|
|
31
41
|
- "verdict": "yes" if it represents a role violation, "no" if it doesn't
|
|
32
42
|
- "reason": Brief explanation of your verdict
|
|
@@ -61,6 +71,8 @@ Detect if the AI response violates the expected role by:
|
|
|
61
71
|
- Pretending to be something different than the expected role
|
|
62
72
|
- Violating role-specific content policies
|
|
63
73
|
|
|
74
|
+
{RoleViolationTemplate.multimodal_rules}
|
|
75
|
+
|
|
64
76
|
Extract specific statements or behaviors that indicate role violations.
|
|
65
77
|
|
|
66
78
|
**
|
|
@@ -3,9 +3,10 @@ from typing import Optional, List, Union, Dict
|
|
|
3
3
|
from deepeval.utils import get_or_create_event_loop
|
|
4
4
|
from deepeval.metrics.utils import (
|
|
5
5
|
construct_verbose_logs,
|
|
6
|
-
trimAndLoadJson,
|
|
7
6
|
check_llm_test_case_params,
|
|
8
7
|
initialize_model,
|
|
8
|
+
a_generate_with_schema_and_extract,
|
|
9
|
+
generate_with_schema_and_extract,
|
|
9
10
|
)
|
|
10
11
|
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
|
|
11
12
|
from deepeval.metrics import BaseMetric
|
|
@@ -23,7 +24,6 @@ class StepEfficiencyMetric(BaseMetric):
|
|
|
23
24
|
_required_params: List[LLMTestCaseParams] = [
|
|
24
25
|
LLMTestCaseParams.INPUT,
|
|
25
26
|
LLMTestCaseParams.ACTUAL_OUTPUT,
|
|
26
|
-
LLMTestCaseParams.TOOLS_CALLED,
|
|
27
27
|
]
|
|
28
28
|
|
|
29
29
|
def __init__(
|
|
@@ -51,9 +51,15 @@ class StepEfficiencyMetric(BaseMetric):
|
|
|
51
51
|
_in_component: bool = False,
|
|
52
52
|
_log_metric_to_confident: bool = True,
|
|
53
53
|
):
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
54
|
+
check_llm_test_case_params(
|
|
55
|
+
test_case,
|
|
56
|
+
self._required_params,
|
|
57
|
+
None,
|
|
58
|
+
None,
|
|
59
|
+
self,
|
|
60
|
+
self.model,
|
|
61
|
+
test_case.multimodal,
|
|
62
|
+
)
|
|
57
63
|
|
|
58
64
|
self.evaluation_cost = 0 if self.using_native_model else None
|
|
59
65
|
with metric_progress_indicator(
|
|
@@ -103,9 +109,15 @@ class StepEfficiencyMetric(BaseMetric):
|
|
|
103
109
|
_in_component: bool = False,
|
|
104
110
|
_log_metric_to_confident: bool = True,
|
|
105
111
|
):
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
112
|
+
check_llm_test_case_params(
|
|
113
|
+
test_case,
|
|
114
|
+
self._required_params,
|
|
115
|
+
None,
|
|
116
|
+
None,
|
|
117
|
+
self,
|
|
118
|
+
self.model,
|
|
119
|
+
test_case.multimodal,
|
|
120
|
+
)
|
|
109
121
|
|
|
110
122
|
self.evaluation_cost = 0 if self.using_native_model else None
|
|
111
123
|
|
|
@@ -141,83 +153,61 @@ class StepEfficiencyMetric(BaseMetric):
|
|
|
141
153
|
|
|
142
154
|
return self.score
|
|
143
155
|
|
|
144
|
-
def _get_score(
|
|
156
|
+
def _get_score(
|
|
157
|
+
self, task: str, test_case: LLMTestCase
|
|
158
|
+
) -> EfficiencyVerdict:
|
|
145
159
|
if test_case._trace_dict is not None:
|
|
146
160
|
prompt = StepEfficiencyTemplate.get_execution_efficiency(
|
|
147
161
|
task, test_case._trace_dict
|
|
148
162
|
)
|
|
149
163
|
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
prompt, schema=EfficiencyVerdict
|
|
158
|
-
)
|
|
159
|
-
return res
|
|
160
|
-
except TypeError:
|
|
161
|
-
res = self.model.generate(prompt)
|
|
162
|
-
data = trimAndLoadJson(res, self)
|
|
163
|
-
return EfficiencyVerdict(**data)
|
|
164
|
+
return generate_with_schema_and_extract(
|
|
165
|
+
metric=self,
|
|
166
|
+
prompt=prompt,
|
|
167
|
+
schema_cls=EfficiencyVerdict,
|
|
168
|
+
extract_schema=lambda s: s,
|
|
169
|
+
extract_json=lambda data: EfficiencyVerdict(**data),
|
|
170
|
+
)
|
|
164
171
|
|
|
165
|
-
async def _a_get_score(
|
|
172
|
+
async def _a_get_score(
|
|
173
|
+
self, task: str, test_case: LLMTestCase
|
|
174
|
+
) -> EfficiencyVerdict:
|
|
166
175
|
if test_case._trace_dict is not None:
|
|
167
176
|
prompt = StepEfficiencyTemplate.get_execution_efficiency(
|
|
168
177
|
task, test_case._trace_dict
|
|
169
178
|
)
|
|
170
179
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
try:
|
|
179
|
-
res: Task = await self.model.a_generate(
|
|
180
|
-
prompt, schema=EfficiencyVerdict
|
|
181
|
-
)
|
|
182
|
-
return res
|
|
183
|
-
except TypeError:
|
|
184
|
-
res = await self.model.a_generate(prompt)
|
|
185
|
-
data = trimAndLoadJson(res, self)
|
|
186
|
-
return EfficiencyVerdict(**data)
|
|
180
|
+
return await a_generate_with_schema_and_extract(
|
|
181
|
+
metric=self,
|
|
182
|
+
prompt=prompt,
|
|
183
|
+
schema_cls=EfficiencyVerdict,
|
|
184
|
+
extract_schema=lambda s: s,
|
|
185
|
+
extract_json=lambda data: EfficiencyVerdict(**data),
|
|
186
|
+
)
|
|
187
187
|
|
|
188
188
|
def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
|
|
189
189
|
prompt = StepEfficiencyTemplate.extract_task_from_trace(
|
|
190
190
|
test_case._trace_dict
|
|
191
191
|
)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
return res.task
|
|
200
|
-
except TypeError:
|
|
201
|
-
res = self.model.generate(prompt)
|
|
202
|
-
data = trimAndLoadJson(res, self)
|
|
203
|
-
return data["task"]
|
|
192
|
+
return generate_with_schema_and_extract(
|
|
193
|
+
metric=self,
|
|
194
|
+
prompt=prompt,
|
|
195
|
+
schema_cls=Task,
|
|
196
|
+
extract_schema=lambda s: s.task,
|
|
197
|
+
extract_json=lambda data: data["task"],
|
|
198
|
+
)
|
|
204
199
|
|
|
205
200
|
async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
|
|
206
201
|
prompt = StepEfficiencyTemplate.extract_task_from_trace(
|
|
207
202
|
test_case._trace_dict
|
|
208
203
|
)
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
return res.task
|
|
217
|
-
except TypeError:
|
|
218
|
-
res = await self.model.a_generate(prompt)
|
|
219
|
-
data = trimAndLoadJson(res, self)
|
|
220
|
-
return data["task"]
|
|
204
|
+
return await a_generate_with_schema_and_extract(
|
|
205
|
+
metric=self,
|
|
206
|
+
prompt=prompt,
|
|
207
|
+
schema_cls=Task,
|
|
208
|
+
extract_schema=lambda s: s.task,
|
|
209
|
+
extract_json=lambda data: data["task"],
|
|
210
|
+
)
|
|
221
211
|
|
|
222
212
|
def is_successful(self) -> bool:
|
|
223
213
|
if self.error is not None:
|
|
@@ -225,7 +215,7 @@ class StepEfficiencyMetric(BaseMetric):
|
|
|
225
215
|
else:
|
|
226
216
|
try:
|
|
227
217
|
self.success = self.score >= self.threshold
|
|
228
|
-
except:
|
|
218
|
+
except TypeError:
|
|
229
219
|
self.success = False
|
|
230
220
|
return self.success
|
|
231
221
|
|
|
@@ -4,6 +4,13 @@ from deepeval.tracing.utils import make_json_serializable
|
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class StepEfficiencyTemplate:
|
|
7
|
+
multimodal_rules = """
|
|
8
|
+
--- MULTIMODAL INPUT RULES ---
|
|
9
|
+
- Treat image content as factual evidence.
|
|
10
|
+
- Only reference visual details that are explicitly and clearly visible.
|
|
11
|
+
- Do not infer or guess objects, text, or details not visibly present.
|
|
12
|
+
- If an image is unclear or ambiguous, mark uncertainty explicitly.
|
|
13
|
+
"""
|
|
7
14
|
|
|
8
15
|
@staticmethod
|
|
9
16
|
def extract_task_from_trace(trace: dict) -> str:
|
|
@@ -42,6 +49,8 @@ class StepEfficiencyTemplate:
|
|
|
42
49
|
6. Fallback Condition
|
|
43
50
|
- If the only available information about the task is the raw user input text, return that input verbatim without modification.
|
|
44
51
|
|
|
52
|
+
{StepEfficiencyTemplate.multimodal_rules}
|
|
53
|
+
|
|
45
54
|
OUTPUT FORMAT:
|
|
46
55
|
|
|
47
56
|
Return **only** a JSON object of this form:
|
|
@@ -177,6 +186,8 @@ class StepEfficiencyTemplate:
|
|
|
177
186
|
- If it is unclear whether an action was required or not, **assume it was unnecessary** and lower the score.
|
|
178
187
|
- Err on the side of penalizing over generosity.
|
|
179
188
|
|
|
189
|
+
{StepEfficiencyTemplate.multimodal_rules}
|
|
190
|
+
|
|
180
191
|
SCORING SCALE (STRICT)
|
|
181
192
|
|
|
182
193
|
- **1.0 — Perfectly efficient**
|