deepeval 3.7.5__py3-none-any.whl → 3.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +35 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +10 -222
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +3 -0
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +13 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +87 -78
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +3 -6
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +68 -38
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
- deepeval/metrics/mcp/template.py +52 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/tool_use.py +42 -66
- deepeval/metrics/topic_adherence/template.py +13 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +1 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +110 -68
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +104 -61
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +106 -65
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +104 -73
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +145 -90
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/amazon_bedrock_model.py +226 -71
- deepeval/models/llms/anthropic_model.py +141 -47
- deepeval/models/llms/azure_model.py +167 -94
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +79 -29
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +125 -59
- deepeval/models/llms/kimi_model.py +126 -81
- deepeval/models/llms/litellm_model.py +92 -18
- deepeval/models/llms/local_model.py +114 -15
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +167 -310
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/utils.py +60 -4
- deepeval/simulator/conversation_simulator.py +43 -0
- deepeval/simulator/template.py +13 -0
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -1
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/RECORD +128 -132
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0

deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
CHANGED

@@ -7,8 +7,9 @@ from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     get_unit_interactions,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
@@ -50,7 +51,12 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -120,7 +126,12 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -185,18 +196,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.get_tool_correctness_score(
             task, test_case.mcp_servers
         )
-        … (7 removed lines not captured in this view)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolScore(**data),
+        )
 
     async def _a_get_tool_accuracy_score(
         self, task: Task, test_case: ConversationalTestCase
@@ -204,20 +210,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.get_tool_correctness_score(
             task, test_case.mcp_servers
         )
-        … (7 removed lines not captured in this view)
-                    prompt, schema=ToolScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolScore(**data),
+        )
 
     def _get_args_score(
         self, task: Task, test_case: ConversationalTestCase
@@ -225,18 +224,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.get_args_correctness_score(
             task, test_case.mcp_servers
         )
-        … (7 removed lines not captured in this view)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ArgsScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgsScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgsScore(**data),
+        )
 
     async def _a_get_args_score(
         self, task: Task, test_case: ConversationalTestCase
@@ -244,20 +238,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.get_args_correctness_score(
             task, test_case.mcp_servers
         )
-        … (7 removed lines not captured in this view)
-                    prompt, schema=ArgsScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ArgsScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgsScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgsScore(**data),
+        )
 
     def _get_tasks(self, unit_interactions: List) -> List[Task]:
         tasks = []
@@ -334,32 +321,63 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         self,
         tool_accuracy_score: List[ToolScore],
         args_accuracy_score: List[ArgsScore],
-    ) -> str:
-        … (1 removed line not captured in this view)
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
         for task_score in tool_accuracy_score:
-            … (15 removed lines not captured in this view)
+            reasons.append(task_score.reason)
+
+        for arg_score in args_accuracy_score:
+            reasons.append(arg_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self,
+        tool_accuracy_score: List[ToolScore],
+        args_accuracy_score: List[ArgsScore],
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
+        for task_score in tool_accuracy_score:
+            reasons.append(task_score.reason)
+
+        for arg_score in args_accuracy_score:
+            reasons.append(arg_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
 
     def is_successful(self) -> bool:
         if self.error is not None:
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
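The hunks above repeatedly swap per-metric try/except blocks for two shared helpers now imported from deepeval.metrics.utils. The sketch below is illustrative only, inferred from the call sites and the removed code shown in this diff; the actual helper lives in deepeval/metrics/utils.py and may differ in signature and behavior.

    def generate_with_schema_and_extract(
        metric, prompt, schema_cls, extract_schema, extract_json
    ):
        # Hypothetical sketch of the shared helper, not the shipped code.
        if metric.using_native_model:
            # Native models return (result, cost) when given a schema.
            res, cost = metric.model.generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            # Custom models that accept a schema return the parsed object.
            res = metric.model.generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Models without schema support return text; parse it as JSON.
            res = metric.model.generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)
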
deepeval/metrics/mcp/template.py
CHANGED

@@ -4,6 +4,14 @@ from deepeval.test_case import MCPServer
 
 
 class MCPTaskCompletionTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def get_args_correctness_score(task: Task, mcp_servers: List[MCPServer]):
         available_tools = [data.available_tools for data in mcp_servers]
@@ -12,6 +20,8 @@ class MCPTaskCompletionTemplate:
         steps_taken = "\n".join(task.steps_taken)
         return f"""Evaluate whether the arguments (inputs) provided by the agent to the tools, resources, and prompts were correct and aligned with their respective input schemas. Your job is to determine if the agent supplied appropriate, complete, and well-formatted arguments for each invocation.
 
+{MCPTaskCompletionTemplate.multimodal_rules}
+
 Output a JSON object with exactly two fields: 'score' and 'reason'.
 
 Scoring:
@@ -55,6 +65,8 @@ JSON:
         steps_taken = "\n".join(task.steps_taken)
         return f"""Evaluate whether the tools, resources, and prompts used by the agent were appropriate and optimal, based strictly on the list of available tools and resources provided. Your job is to determine whether the agent selected the most suitable tools and prompts for the task at hand. Output a JSON object with exactly two fields: 'score' and 'reason'.
 
+{MCPTaskCompletionTemplate.multimodal_rules}
+
 Scoring:
 - 'score' is a float between 0 and 1 inclusive.
 - Use intermediate values (e.g., 0.25, 0.5, 0.75) to reflect partially appropriate tool use, suboptimal decisions, or missed better alternatives.
@@ -92,6 +104,9 @@ JSON:
     def get_task_completion_score(task: Task):
         steps_taken = "\n".join(task.steps_taken)
         return f"""Evaluate whether the user's task has been successfully completed by the agent, based strictly on what the user can see in the agent's responses. You must return a JSON object with exactly two fields: 'score' and 'reason'.
+
+{MCPTaskCompletionTemplate.multimodal_rules}
+
 Scoring:
 - 'score' is a float between 0 and 1 inclusive.
 - Use intermediate values (e.g., 0.25, 0.5, 0.75) to reflect partial task success or missing/inaccurate information.
@@ -123,3 +138,40 @@ Example Output:
 
 JSON:
 """
+
+    @staticmethod
+    def generate_final_reason(
+        final_score: float, success: bool, reasons: List[str]
+    ):
+        return f"""You are an AI evaluator producing a single final explanation for the an MCP application's evaluation results using the provided reasons.
+
+Context:
+The reasons are from metrics that were used to evaluate an MCP application by determining whether the model accurately completed a task or called toos and resources with the right arguments.
+
+Inputs:
+- final_score: the averaged score across all interactions.
+- success: whether the metric passed or failed
+- reasons: a list of textual reasons generated from individual interactions.
+
+Instructions:
+1. Read all reasons and synthesize them into one unified explanation.
+2. Do not repeat every reason; merge them into a concise, coherent narrative.
+4. If the metric failed, state the dominant failure reasons. If it passed, state why the application has passed.
+5. Output a single paragraph with no lists, no bullets, no markup.
+
+Output:
+A single paragraph explaining the final outcome.
+
+Here's the inputs:
+
+Final Score: {final_score}
+
+Reasons:
+{reasons}
+
+Success: {success}
+
+Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
+
+The final reason:
+"""

deepeval/metrics/mcp_use_metric/mcp_use_metric.py
CHANGED

@@ -3,9 +3,10 @@ from typing import Optional, List, Union
 from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -54,7 +55,16 @@ class MCPUseMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        … (1 removed line not captured in this view)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -121,11 +131,23 @@ class MCPUseMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        … (1 removed line not captured in this view)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self,
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             available_primitives, primitives_used = (
                 self._get_mcp_interaction_text(
@@ -177,20 +199,13 @@ class MCPUseMetric(BaseMetric):
         prompt = MCPUseMetricTemplate.get_primitive_correctness_prompt(
             test_case, available_primitives, primitives_used
         )
-        … (7 removed lines not captured in this view)
-                    prompt, schema=MCPPrimitivesScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return MCPPrimitivesScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MCPPrimitivesScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: MCPPrimitivesScore(**data),
+        )
 
     async def _a_get_primitives_used_score(
         self,
@@ -201,22 +216,13 @@ class MCPUseMetric(BaseMetric):
         prompt = MCPUseMetricTemplate.get_primitive_correctness_prompt(
             test_case, available_primitives, primitives_used
         )
-        … (7 removed lines not captured in this view)
-            try:
-                res: MCPPrimitivesScore = await self.model.a_generate(
-                    prompt, schema=MCPPrimitivesScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return MCPPrimitivesScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MCPPrimitivesScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: MCPPrimitivesScore(**data),
+        )
 
     def _get_argument_correctness_score(
         self,
@@ -227,20 +233,13 @@ class MCPUseMetric(BaseMetric):
         prompt = MCPUseMetricTemplate.get_mcp_argument_correctness_prompt(
             test_case, available_primitives, primitives_used
         )
-        … (7 removed lines not captured in this view)
-                    prompt, schema=MCPArgsScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return MCPArgsScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MCPArgsScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: MCPArgsScore(**data),
+        )
 
     async def _a_get_argument_correctness_score(
         self,
@@ -251,20 +250,13 @@ class MCPUseMetric(BaseMetric):
         prompt = MCPUseMetricTemplate.get_mcp_argument_correctness_prompt(
             test_case, available_primitives, primitives_used
         )
-        … (7 removed lines not captured in this view)
-                    prompt, schema=MCPArgsScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return MCPArgsScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MCPArgsScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: MCPArgsScore(**data),
+        )
 
     def _calculate_score(
         self,
@@ -280,7 +272,9 @@ class MCPUseMetric(BaseMetric):
         self,
         primitives_used_score: MCPPrimitivesScore,
         argument_correctness_score: MCPArgsScore,
-    ) -> str:
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
         return (
             f"[\n"
             f"\t{primitives_used_score.reason}\n"
@@ -390,7 +384,7 @@ class MCPUseMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
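The asynchronous call sites in this file use a matching awaitable helper. As with the earlier sketch, this is only an inferred outline, not the code shipped in deepeval/metrics/utils.py:

    async def a_generate_with_schema_and_extract(
        metric, prompt, schema_cls, extract_schema, extract_json
    ):
        # Hypothetical async twin of the synchronous helper sketched above.
        if metric.using_native_model:
            res, cost = await metric.model.a_generate(prompt, schema=schema_cls)
            metric.evaluation_cost += cost
            return extract_schema(res)
        try:
            res = await metric.model.a_generate(prompt, schema=schema_cls)
            return extract_schema(res)
        except TypeError:
            # Models without schema support return plain text; parse it as JSON.
            res = await metric.model.a_generate(prompt)
            data = trimAndLoadJson(res, metric)
            return extract_json(data)
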

deepeval/metrics/mcp_use_metric/template.py
CHANGED

@@ -3,6 +3,14 @@ import textwrap
 
 
 class MCPUseMetricTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def get_mcp_argument_correctness_prompt(
         test_case: LLMTestCase,
@@ -12,6 +20,8 @@ class MCPUseMetricTemplate:
         return textwrap.dedent(
             f"""Evaluate whether the arguments passed to each tool (primitive) used by the agent were appropriate and correct for the intended purpose. Focus on whether the input types, formats, and contents match the expectations of the tools and are suitable given the user's request.
 
+            {MCPUseMetricTemplate.multimodal_rules}
+
             You must return a JSON object with exactly two fields: 'score' and 'reason'.
 
             Scoring:
@@ -68,6 +78,8 @@ class MCPUseMetricTemplate:
         return textwrap.dedent(
             f"""Evaluate whether the tools (primitives) selected and used by the agent were appropriate and correct for fulfilling the user’s request. Base your judgment on the user input, the agent’s visible output, and the tools that were available to the agent. You must return a JSON object with exactly two fields: 'score' and 'reason'.
 
+            {MCPUseMetricTemplate.multimodal_rules}
+
             Scoring:
             - 'score' is a float between 0 and 1 inclusive.
             - Use intermediate values (e.g., 0.25, 0.5, 0.75) to reflect cases where the tools used were partially correct, suboptimal, or only somewhat relevant.