deepeval 3.7.4__py3-none-any.whl → 3.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +35 -1
  3. deepeval/dataset/api.py +23 -1
  4. deepeval/dataset/golden.py +139 -2
  5. deepeval/evaluate/evaluate.py +16 -11
  6. deepeval/evaluate/execute.py +13 -181
  7. deepeval/evaluate/utils.py +6 -26
  8. deepeval/integrations/pydantic_ai/agent.py +19 -2
  9. deepeval/integrations/pydantic_ai/instrumentator.py +62 -23
  10. deepeval/key_handler.py +3 -0
  11. deepeval/metrics/__init__.py +14 -16
  12. deepeval/metrics/answer_relevancy/answer_relevancy.py +118 -116
  13. deepeval/metrics/answer_relevancy/template.py +22 -3
  14. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  15. deepeval/metrics/arena_g_eval/template.py +17 -1
  16. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  17. deepeval/metrics/argument_correctness/template.py +19 -2
  18. deepeval/metrics/base_metric.py +13 -44
  19. deepeval/metrics/bias/bias.py +102 -108
  20. deepeval/metrics/bias/template.py +14 -2
  21. deepeval/metrics/contextual_precision/contextual_precision.py +96 -94
  22. deepeval/metrics/contextual_precision/template.py +115 -66
  23. deepeval/metrics/contextual_recall/contextual_recall.py +94 -84
  24. deepeval/metrics/contextual_recall/template.py +106 -55
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +86 -84
  26. deepeval/metrics/contextual_relevancy/template.py +87 -58
  27. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  28. deepeval/metrics/conversation_completeness/template.py +23 -3
  29. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  30. deepeval/metrics/conversational_dag/nodes.py +66 -123
  31. deepeval/metrics/conversational_dag/templates.py +16 -0
  32. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  33. deepeval/metrics/dag/dag.py +10 -0
  34. deepeval/metrics/dag/nodes.py +63 -126
  35. deepeval/metrics/dag/templates.py +16 -2
  36. deepeval/metrics/exact_match/exact_match.py +9 -1
  37. deepeval/metrics/faithfulness/faithfulness.py +138 -149
  38. deepeval/metrics/faithfulness/schema.py +1 -1
  39. deepeval/metrics/faithfulness/template.py +200 -115
  40. deepeval/metrics/g_eval/g_eval.py +87 -78
  41. deepeval/metrics/g_eval/template.py +18 -1
  42. deepeval/metrics/g_eval/utils.py +7 -6
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  44. deepeval/metrics/goal_accuracy/template.py +21 -3
  45. deepeval/metrics/hallucination/hallucination.py +60 -75
  46. deepeval/metrics/hallucination/template.py +13 -0
  47. deepeval/metrics/indicator.py +7 -10
  48. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  49. deepeval/metrics/json_correctness/template.py +10 -0
  50. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  51. deepeval/metrics/knowledge_retention/schema.py +9 -3
  52. deepeval/metrics/knowledge_retention/template.py +12 -0
  53. deepeval/metrics/mcp/mcp_task_completion.py +68 -38
  54. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +92 -74
  55. deepeval/metrics/mcp/template.py +52 -0
  56. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  57. deepeval/metrics/mcp_use_metric/template.py +12 -0
  58. deepeval/metrics/misuse/misuse.py +77 -97
  59. deepeval/metrics/misuse/template.py +15 -0
  60. deepeval/metrics/multimodal_metrics/__init__.py +0 -19
  61. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +59 -53
  62. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +79 -95
  63. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +59 -53
  64. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +59 -53
  65. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +111 -109
  66. deepeval/metrics/non_advice/non_advice.py +79 -105
  67. deepeval/metrics/non_advice/template.py +12 -0
  68. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  69. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  70. deepeval/metrics/pii_leakage/template.py +14 -0
  71. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  72. deepeval/metrics/plan_adherence/template.py +11 -0
  73. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  74. deepeval/metrics/plan_quality/template.py +9 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +72 -83
  76. deepeval/metrics/prompt_alignment/template.py +12 -0
  77. deepeval/metrics/ragas.py +3 -3
  78. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  79. deepeval/metrics/role_adherence/template.py +14 -0
  80. deepeval/metrics/role_violation/role_violation.py +75 -108
  81. deepeval/metrics/role_violation/template.py +12 -0
  82. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  83. deepeval/metrics/step_efficiency/template.py +11 -0
  84. deepeval/metrics/summarization/summarization.py +115 -183
  85. deepeval/metrics/summarization/template.py +19 -0
  86. deepeval/metrics/task_completion/task_completion.py +67 -73
  87. deepeval/metrics/tool_correctness/tool_correctness.py +45 -44
  88. deepeval/metrics/tool_use/tool_use.py +42 -66
  89. deepeval/metrics/topic_adherence/template.py +13 -0
  90. deepeval/metrics/topic_adherence/topic_adherence.py +53 -67
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +21 -0
  94. deepeval/metrics/turn_contextual_precision/template.py +187 -0
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +592 -0
  96. deepeval/metrics/turn_contextual_recall/schema.py +21 -0
  97. deepeval/metrics/turn_contextual_recall/template.py +178 -0
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +563 -0
  99. deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_contextual_relevancy}/schema.py +7 -1
  100. deepeval/metrics/turn_contextual_relevancy/template.py +161 -0
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +576 -0
  102. deepeval/metrics/{multimodal_metrics/multimodal_faithfulness → turn_faithfulness}/schema.py +11 -3
  103. deepeval/metrics/turn_faithfulness/template.py +218 -0
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +627 -0
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +158 -122
  108. deepeval/models/__init__.py +0 -12
  109. deepeval/models/base_model.py +49 -33
  110. deepeval/models/embedding_models/__init__.py +7 -0
  111. deepeval/models/embedding_models/azure_embedding_model.py +79 -33
  112. deepeval/models/embedding_models/local_embedding_model.py +39 -20
  113. deepeval/models/embedding_models/ollama_embedding_model.py +52 -19
  114. deepeval/models/embedding_models/openai_embedding_model.py +42 -22
  115. deepeval/models/llms/amazon_bedrock_model.py +226 -72
  116. deepeval/models/llms/anthropic_model.py +178 -63
  117. deepeval/models/llms/azure_model.py +218 -60
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +95 -40
  120. deepeval/models/llms/gemini_model.py +209 -64
  121. deepeval/models/llms/grok_model.py +139 -68
  122. deepeval/models/llms/kimi_model.py +140 -90
  123. deepeval/models/llms/litellm_model.py +131 -37
  124. deepeval/models/llms/local_model.py +125 -21
  125. deepeval/models/llms/ollama_model.py +147 -24
  126. deepeval/models/llms/openai_model.py +222 -269
  127. deepeval/models/llms/portkey_model.py +81 -22
  128. deepeval/models/llms/utils.py +8 -3
  129. deepeval/models/retry_policy.py +17 -14
  130. deepeval/models/utils.py +106 -5
  131. deepeval/optimizer/__init__.py +5 -0
  132. deepeval/optimizer/algorithms/__init__.py +6 -0
  133. deepeval/optimizer/algorithms/base.py +29 -0
  134. deepeval/optimizer/algorithms/configs.py +18 -0
  135. deepeval/optimizer/algorithms/copro/__init__.py +5 -0
  136. deepeval/{optimization/copro/loop.py → optimizer/algorithms/copro/copro.py} +112 -113
  137. deepeval/optimizer/algorithms/gepa/__init__.py +5 -0
  138. deepeval/{optimization/gepa/loop.py → optimizer/algorithms/gepa/gepa.py} +175 -115
  139. deepeval/optimizer/algorithms/miprov2/__init__.py +17 -0
  140. deepeval/optimizer/algorithms/miprov2/bootstrapper.py +435 -0
  141. deepeval/optimizer/algorithms/miprov2/miprov2.py +752 -0
  142. deepeval/optimizer/algorithms/miprov2/proposer.py +301 -0
  143. deepeval/optimizer/algorithms/simba/__init__.py +5 -0
  144. deepeval/{optimization/simba/loop.py → optimizer/algorithms/simba/simba.py} +128 -112
  145. deepeval/{optimization → optimizer}/configs.py +5 -8
  146. deepeval/{optimization/policies/selection.py → optimizer/policies.py} +63 -2
  147. deepeval/optimizer/prompt_optimizer.py +263 -0
  148. deepeval/optimizer/rewriter/__init__.py +5 -0
  149. deepeval/optimizer/rewriter/rewriter.py +124 -0
  150. deepeval/optimizer/rewriter/utils.py +214 -0
  151. deepeval/optimizer/scorer/__init__.py +5 -0
  152. deepeval/optimizer/scorer/base.py +86 -0
  153. deepeval/optimizer/scorer/scorer.py +316 -0
  154. deepeval/optimizer/scorer/utils.py +30 -0
  155. deepeval/optimizer/types.py +148 -0
  156. deepeval/{optimization → optimizer}/utils.py +47 -165
  157. deepeval/prompt/prompt.py +5 -9
  158. deepeval/simulator/conversation_simulator.py +43 -0
  159. deepeval/simulator/template.py +13 -0
  160. deepeval/test_case/__init__.py +1 -3
  161. deepeval/test_case/api.py +26 -45
  162. deepeval/test_case/arena_test_case.py +7 -2
  163. deepeval/test_case/conversational_test_case.py +68 -1
  164. deepeval/test_case/llm_test_case.py +206 -1
  165. deepeval/test_case/utils.py +4 -8
  166. deepeval/test_run/api.py +18 -14
  167. deepeval/test_run/test_run.py +3 -3
  168. deepeval/tracing/patchers.py +9 -4
  169. deepeval/tracing/tracing.py +2 -2
  170. deepeval/utils.py +65 -0
  171. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/METADATA +1 -4
  172. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/RECORD +180 -193
  173. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +0 -343
  174. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/schema.py +0 -19
  175. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/template.py +0 -122
  176. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +0 -301
  177. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/schema.py +0 -15
  178. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/template.py +0 -132
  179. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +0 -285
  180. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/schema.py +0 -15
  181. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/template.py +0 -112
  182. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +0 -282
  183. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/template.py +0 -102
  184. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/__init__.py +0 -0
  185. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +0 -356
  186. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/template.py +0 -175
  187. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  188. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  189. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  190. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -148
  191. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  192. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py +0 -0
  193. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +0 -290
  194. deepeval/models/mlllms/__init__.py +0 -4
  195. deepeval/models/mlllms/azure_model.py +0 -343
  196. deepeval/models/mlllms/gemini_model.py +0 -313
  197. deepeval/models/mlllms/ollama_model.py +0 -175
  198. deepeval/models/mlllms/openai_model.py +0 -309
  199. deepeval/optimization/__init__.py +0 -13
  200. deepeval/optimization/adapters/__init__.py +0 -2
  201. deepeval/optimization/adapters/deepeval_scoring_adapter.py +0 -588
  202. deepeval/optimization/aggregates.py +0 -14
  203. deepeval/optimization/copro/configs.py +0 -31
  204. deepeval/optimization/gepa/__init__.py +0 -7
  205. deepeval/optimization/gepa/configs.py +0 -115
  206. deepeval/optimization/miprov2/configs.py +0 -134
  207. deepeval/optimization/miprov2/loop.py +0 -785
  208. deepeval/optimization/mutations/__init__.py +0 -0
  209. deepeval/optimization/mutations/prompt_rewriter.py +0 -458
  210. deepeval/optimization/policies/__init__.py +0 -16
  211. deepeval/optimization/policies/tie_breaker.py +0 -67
  212. deepeval/optimization/prompt_optimizer.py +0 -462
  213. deepeval/optimization/simba/__init__.py +0 -0
  214. deepeval/optimization/simba/configs.py +0 -33
  215. deepeval/optimization/types.py +0 -361
  216. deepeval/test_case/mllm_test_case.py +0 -170
  217. /deepeval/metrics/{multimodal_metrics/multimodal_answer_relevancy → turn_contextual_precision}/__init__.py +0 -0
  218. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_precision → turn_contextual_recall}/__init__.py +0 -0
  219. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_recall → turn_contextual_relevancy}/__init__.py +0 -0
  220. /deepeval/metrics/{multimodal_metrics/multimodal_contextual_relevancy → turn_faithfulness}/__init__.py +0 -0
  221. /deepeval/{optimization → optimizer/algorithms}/simba/types.py +0 -0
  222. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/LICENSE.md +0 -0
  223. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/WHEEL +0 -0
  224. {deepeval-3.7.4.dist-info → deepeval-3.7.6.dist-info}/entry_points.txt +0 -0
deepeval/metrics/mcp/multi_turn_mcp_use_metric.py

@@ -7,8 +7,9 @@ from deepeval.metrics.utils import (
     check_conversational_test_case_params,
     construct_verbose_logs,
     get_unit_interactions,
-    trimAndLoadJson,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.test_case import ConversationalTestCase, TurnParams
@@ -50,7 +51,12 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -120,7 +126,12 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -185,18 +196,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.get_tool_correctness_score(
             task, test_case.mcp_servers
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ToolScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ToolScore = self.model.generate(prompt, schema=ToolScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolScore(**data),
+        )
 
     async def _a_get_tool_accuracy_score(
         self, task: Task, test_case: ConversationalTestCase
@@ -204,20 +210,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.get_tool_correctness_score(
             task, test_case.mcp_servers
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ToolScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ToolScore = await self.model.a_generate(
-                    prompt, schema=ToolScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolScore(**data),
+        )
 
     def _get_args_score(
         self, task: Task, test_case: ConversationalTestCase
@@ -225,18 +224,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.get_args_correctness_score(
             task, test_case.mcp_servers
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ArgsScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ArgsScore = self.model.generate(prompt, schema=ArgsScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ArgsScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgsScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgsScore(**data),
+        )
 
     async def _a_get_args_score(
         self, task: Task, test_case: ConversationalTestCase
@@ -244,20 +238,13 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         prompt = MCPTaskCompletionTemplate.get_args_correctness_score(
             task, test_case.mcp_servers
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=ArgsScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ArgsScore = await self.model.a_generate(
-                    prompt, schema=ArgsScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ArgsScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgsScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgsScore(**data),
+        )
 
     def _get_tasks(self, unit_interactions: List) -> List[Task]:
         tasks = []
@@ -334,32 +321,63 @@ class MultiTurnMCPUseMetric(BaseConversationalMetric):
         self,
         tool_accuracy_score: List[ToolScore],
         args_accuracy_score: List[ArgsScore],
-    ) -> str:
-        reason = "["
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
         for task_score in tool_accuracy_score:
-            if task_score.score < self.threshold:
-                reason += "\nPrimitives Used\n"
-                reason += (
-                    f"Score: {task_score.score}\n"
-                    f"Reason: {task_score.reason}\n"
-                )
-        for task_score in args_accuracy_score:
-            if task_score.score < self.threshold:
-                reason += "\nArguments Generated\n"
-                reason += (
-                    f"Score: {task_score.score}\n"
-                    f"Reason: {task_score.reason}\n"
-                )
-        reason += "]"
-        return reason
+            reasons.append(task_score.reason)
+
+        for arg_score in args_accuracy_score:
+            reasons.append(arg_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = self.model.generate(prompt)
+            return res
+
+    async def _a_generate_reason(
+        self,
+        tool_accuracy_score: List[ToolScore],
+        args_accuracy_score: List[ArgsScore],
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
+
+        reasons = []
+        for task_score in tool_accuracy_score:
+            reasons.append(task_score.reason)
+
+        for arg_score in args_accuracy_score:
+            reasons.append(arg_score.reason)
+
+        prompt = MCPTaskCompletionTemplate.generate_final_reason(
+            self.score, self.success, reasons
+        )
+
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(prompt)
+            self.evaluation_cost += cost
+            return res
+        else:
+            res = await self.model.a_generate(prompt)
+            return res
 
     def is_successful(self) -> bool:
         if self.error is not None:
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success
 
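Note on the refactor above: the repeated "native model vs. custom model with JSON fallback" branches are replaced by the new generate_with_schema_and_extract and a_generate_with_schema_and_extract helpers imported from deepeval.metrics.utils (the same swap appears in mcp_use_metric.py further down). The following is a minimal sketch of what those helpers plausibly do, inferred only from the inline code deleted in these hunks; the actual implementation shipped in deepeval.metrics.utils may differ.

# Hedged sketch only: reconstructed from the deleted inline branches above,
# not copied from deepeval 3.7.6 source. Assumes metric.model.generate /
# a_generate and trimAndLoadJson behave exactly as in the removed code.
from deepeval.metrics.utils import trimAndLoadJson


def generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    if metric.using_native_model:
        # Native models return (result, cost); the cost is accumulated.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a schema return the parsed object directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Custom models without schema support: parse the raw JSON text.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)


async def a_generate_with_schema_and_extract(
    metric, prompt, schema_cls, extract_schema, extract_json
):
    # Async twin, mirroring the removed a_generate branches.
    if metric.using_native_model:
        res, cost = await metric.model.a_generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        res = await metric.model.a_generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        res = await metric.model.a_generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)

Call sites in these files pass extract_schema=lambda s: s and extract_json=lambda data: ToolScore(**data) (or the ArgsScore, MCPPrimitivesScore, and MCPArgsScore equivalents), so both the structured-output path and the raw-JSON fallback return the same Pydantic object while evaluation cost is still tracked for native models.
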
deepeval/metrics/mcp/template.py

@@ -4,6 +4,14 @@ from deepeval.test_case import MCPServer
 
 
 class MCPTaskCompletionTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def get_args_correctness_score(task: Task, mcp_servers: List[MCPServer]):
         available_tools = [data.available_tools for data in mcp_servers]
@@ -12,6 +20,8 @@ class MCPTaskCompletionTemplate:
         steps_taken = "\n".join(task.steps_taken)
         return f"""Evaluate whether the arguments (inputs) provided by the agent to the tools, resources, and prompts were correct and aligned with their respective input schemas. Your job is to determine if the agent supplied appropriate, complete, and well-formatted arguments for each invocation.
 
+{MCPTaskCompletionTemplate.multimodal_rules}
+
 Output a JSON object with exactly two fields: 'score' and 'reason'.
 
 Scoring:
@@ -55,6 +65,8 @@ JSON:
         steps_taken = "\n".join(task.steps_taken)
         return f"""Evaluate whether the tools, resources, and prompts used by the agent were appropriate and optimal, based strictly on the list of available tools and resources provided. Your job is to determine whether the agent selected the most suitable tools and prompts for the task at hand. Output a JSON object with exactly two fields: 'score' and 'reason'.
 
+{MCPTaskCompletionTemplate.multimodal_rules}
+
 Scoring:
 - 'score' is a float between 0 and 1 inclusive.
 - Use intermediate values (e.g., 0.25, 0.5, 0.75) to reflect partially appropriate tool use, suboptimal decisions, or missed better alternatives.
@@ -92,6 +104,9 @@ JSON:
     def get_task_completion_score(task: Task):
         steps_taken = "\n".join(task.steps_taken)
         return f"""Evaluate whether the user's task has been successfully completed by the agent, based strictly on what the user can see in the agent's responses. You must return a JSON object with exactly two fields: 'score' and 'reason'.
+
+{MCPTaskCompletionTemplate.multimodal_rules}
+
 Scoring:
 - 'score' is a float between 0 and 1 inclusive.
 - Use intermediate values (e.g., 0.25, 0.5, 0.75) to reflect partial task success or missing/inaccurate information.
@@ -123,3 +138,40 @@ Example Output:
 
 JSON:
 """
+
+    @staticmethod
+    def generate_final_reason(
+        final_score: float, success: bool, reasons: List[str]
+    ):
+        return f"""You are an AI evaluator producing a single final explanation for the an MCP application's evaluation results using the provided reasons.
+
+Context:
+The reasons are from metrics that were used to evaluate an MCP application by determining whether the model accurately completed a task or called toos and resources with the right arguments.
+
+Inputs:
+- final_score: the averaged score across all interactions.
+- success: whether the metric passed or failed
+- reasons: a list of textual reasons generated from individual interactions.
+
+Instructions:
+1. Read all reasons and synthesize them into one unified explanation.
+2. Do not repeat every reason; merge them into a concise, coherent narrative.
+4. If the metric failed, state the dominant failure reasons. If it passed, state why the application has passed.
+5. Output a single paragraph with no lists, no bullets, no markup.
+
+Output:
+A single paragraph explaining the final outcome.
+
+Here's the inputs:
+
+Final Score: {final_score}
+
+Reasons:
+{reasons}
+
+Success: {success}
+
+Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
+
+The final reason:
+"""
deepeval/metrics/mcp_use_metric/mcp_use_metric.py

@@ -3,9 +3,10 @@ from typing import Optional, List, Union
 from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -54,7 +55,16 @@ class MCPUseMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -121,11 +131,23 @@ class MCPUseMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+        multimodal = test_case.multimodal
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
-            self, _show_indicator=_show_indicator, _in_component=_in_component
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             available_primitives, primitives_used = (
                 self._get_mcp_interaction_text(
@@ -177,20 +199,13 @@ class MCPUseMetric(BaseMetric):
         prompt = MCPUseMetricTemplate.get_primitive_correctness_prompt(
             test_case, available_primitives, primitives_used
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=MCPPrimitivesScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: MCPPrimitivesScore = self.model.generate(
-                    prompt, schema=MCPPrimitivesScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return MCPPrimitivesScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MCPPrimitivesScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: MCPPrimitivesScore(**data),
+        )
 
     async def _a_get_primitives_used_score(
         self,
@@ -201,22 +216,13 @@ class MCPUseMetric(BaseMetric):
         prompt = MCPUseMetricTemplate.get_primitive_correctness_prompt(
             test_case, available_primitives, primitives_used
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=MCPPrimitivesScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: MCPPrimitivesScore = await self.model.a_generate(
-                    prompt, schema=MCPPrimitivesScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return MCPPrimitivesScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MCPPrimitivesScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: MCPPrimitivesScore(**data),
+        )
 
     def _get_argument_correctness_score(
         self,
@@ -227,20 +233,13 @@ class MCPUseMetric(BaseMetric):
         prompt = MCPUseMetricTemplate.get_mcp_argument_correctness_prompt(
             test_case, available_primitives, primitives_used
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=MCPArgsScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: MCPArgsScore = self.model.generate(
-                    prompt, schema=MCPArgsScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return MCPArgsScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MCPArgsScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: MCPArgsScore(**data),
+        )
 
     async def _a_get_argument_correctness_score(
         self,
@@ -251,20 +250,13 @@ class MCPUseMetric(BaseMetric):
         prompt = MCPUseMetricTemplate.get_mcp_argument_correctness_prompt(
             test_case, available_primitives, primitives_used
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=MCPArgsScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: MCPArgsScore = await self.model.a_generate(
-                    prompt, schema=MCPArgsScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return MCPArgsScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=MCPArgsScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: MCPArgsScore(**data),
+        )
 
     def _calculate_score(
         self,
@@ -280,7 +272,9 @@ class MCPUseMetric(BaseMetric):
         self,
         primitives_used_score: MCPPrimitivesScore,
         argument_correctness_score: MCPArgsScore,
-    ) -> str:
+    ) -> Optional[str]:
+        if not self.include_reason:
+            return None
         return (
             f"[\n"
             f"\t{primitives_used_score.reason}\n"
@@ -390,7 +384,7 @@ class MCPUseMetric(BaseMetric):
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
deepeval/metrics/mcp_use_metric/template.py

@@ -3,6 +3,14 @@ import textwrap
 
 
 class MCPUseMetricTemplate:
+    multimodal_rules = """
+    --- MULTIMODAL INPUT RULES ---
+    - Treat image content as factual evidence.
+    - Only reference visual details that are explicitly and clearly visible.
+    - Do not infer or guess objects, text, or details not visibly present.
+    - If an image is unclear or ambiguous, mark uncertainty explicitly.
+    """
+
     @staticmethod
     def get_mcp_argument_correctness_prompt(
         test_case: LLMTestCase,
@@ -12,6 +20,8 @@ class MCPUseMetricTemplate:
         return textwrap.dedent(
             f"""Evaluate whether the arguments passed to each tool (primitive) used by the agent were appropriate and correct for the intended purpose. Focus on whether the input types, formats, and contents match the expectations of the tools and are suitable given the user's request.
 
+            {MCPUseMetricTemplate.multimodal_rules}
+
             You must return a JSON object with exactly two fields: 'score' and 'reason'.
 
             Scoring:
@@ -68,6 +78,8 @@ class MCPUseMetricTemplate:
         return textwrap.dedent(
             f"""Evaluate whether the tools (primitives) selected and used by the agent were appropriate and correct for fulfilling the user’s request. Base your judgment on the user input, the agent’s visible output, and the tools that were available to the agent. You must return a JSON object with exactly two fields: 'score' and 'reason'.
 
+            {MCPUseMetricTemplate.multimodal_rules}
+
             Scoring:
             - 'score' is a float between 0 and 1 inclusive.
             - Use intermediate values (e.g., 0.25, 0.5, 0.75) to reflect cases where the tools used were partially correct, suboptimal, or only somewhat relevant.