deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. deepeval/_version.py +1 -1
  2. deepeval/cli/main.py +2022 -759
  3. deepeval/cli/utils.py +208 -36
  4. deepeval/config/dotenv_handler.py +19 -0
  5. deepeval/config/settings.py +675 -245
  6. deepeval/config/utils.py +9 -1
  7. deepeval/dataset/api.py +23 -1
  8. deepeval/dataset/golden.py +106 -21
  9. deepeval/evaluate/evaluate.py +0 -3
  10. deepeval/evaluate/execute.py +162 -315
  11. deepeval/evaluate/utils.py +6 -30
  12. deepeval/key_handler.py +124 -51
  13. deepeval/metrics/__init__.py +0 -4
  14. deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
  15. deepeval/metrics/answer_relevancy/template.py +102 -179
  16. deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
  17. deepeval/metrics/arena_g_eval/template.py +17 -1
  18. deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
  19. deepeval/metrics/argument_correctness/template.py +19 -2
  20. deepeval/metrics/base_metric.py +19 -41
  21. deepeval/metrics/bias/bias.py +102 -108
  22. deepeval/metrics/bias/template.py +14 -2
  23. deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
  24. deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
  25. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
  26. deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
  27. deepeval/metrics/conversation_completeness/template.py +23 -3
  28. deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
  29. deepeval/metrics/conversational_dag/nodes.py +66 -123
  30. deepeval/metrics/conversational_dag/templates.py +16 -0
  31. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
  32. deepeval/metrics/dag/dag.py +10 -0
  33. deepeval/metrics/dag/nodes.py +63 -126
  34. deepeval/metrics/dag/templates.py +14 -0
  35. deepeval/metrics/exact_match/exact_match.py +9 -1
  36. deepeval/metrics/faithfulness/faithfulness.py +82 -136
  37. deepeval/metrics/g_eval/g_eval.py +93 -79
  38. deepeval/metrics/g_eval/template.py +18 -1
  39. deepeval/metrics/g_eval/utils.py +7 -6
  40. deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
  41. deepeval/metrics/goal_accuracy/template.py +21 -3
  42. deepeval/metrics/hallucination/hallucination.py +60 -75
  43. deepeval/metrics/hallucination/template.py +13 -0
  44. deepeval/metrics/indicator.py +11 -10
  45. deepeval/metrics/json_correctness/json_correctness.py +40 -38
  46. deepeval/metrics/json_correctness/template.py +10 -0
  47. deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
  48. deepeval/metrics/knowledge_retention/schema.py +9 -3
  49. deepeval/metrics/knowledge_retention/template.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +72 -43
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
  52. deepeval/metrics/mcp/schema.py +4 -0
  53. deepeval/metrics/mcp/template.py +59 -0
  54. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
  55. deepeval/metrics/mcp_use_metric/template.py +12 -0
  56. deepeval/metrics/misuse/misuse.py +77 -97
  57. deepeval/metrics/misuse/template.py +15 -0
  58. deepeval/metrics/multimodal_metrics/__init__.py +0 -1
  59. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
  60. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
  61. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
  62. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
  63. deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
  64. deepeval/metrics/non_advice/non_advice.py +79 -105
  65. deepeval/metrics/non_advice/template.py +12 -0
  66. deepeval/metrics/pattern_match/pattern_match.py +12 -4
  67. deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
  68. deepeval/metrics/pii_leakage/template.py +14 -0
  69. deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
  70. deepeval/metrics/plan_adherence/template.py +11 -0
  71. deepeval/metrics/plan_quality/plan_quality.py +63 -87
  72. deepeval/metrics/plan_quality/template.py +9 -0
  73. deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
  74. deepeval/metrics/prompt_alignment/template.py +12 -0
  75. deepeval/metrics/role_adherence/role_adherence.py +48 -71
  76. deepeval/metrics/role_adherence/template.py +14 -0
  77. deepeval/metrics/role_violation/role_violation.py +75 -108
  78. deepeval/metrics/role_violation/template.py +12 -0
  79. deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
  80. deepeval/metrics/step_efficiency/template.py +11 -0
  81. deepeval/metrics/summarization/summarization.py +115 -183
  82. deepeval/metrics/summarization/template.py +19 -0
  83. deepeval/metrics/task_completion/task_completion.py +67 -73
  84. deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
  85. deepeval/metrics/tool_use/schema.py +4 -0
  86. deepeval/metrics/tool_use/template.py +16 -2
  87. deepeval/metrics/tool_use/tool_use.py +72 -94
  88. deepeval/metrics/topic_adherence/schema.py +4 -0
  89. deepeval/metrics/topic_adherence/template.py +21 -1
  90. deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
  91. deepeval/metrics/toxicity/template.py +13 -0
  92. deepeval/metrics/toxicity/toxicity.py +80 -99
  93. deepeval/metrics/turn_contextual_precision/schema.py +3 -3
  94. deepeval/metrics/turn_contextual_precision/template.py +9 -2
  95. deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
  96. deepeval/metrics/turn_contextual_recall/schema.py +3 -3
  97. deepeval/metrics/turn_contextual_recall/template.py +8 -1
  98. deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
  99. deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
  100. deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
  101. deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
  102. deepeval/metrics/turn_faithfulness/schema.py +1 -1
  103. deepeval/metrics/turn_faithfulness/template.py +8 -1
  104. deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
  105. deepeval/metrics/turn_relevancy/template.py +14 -0
  106. deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
  107. deepeval/metrics/utils.py +161 -91
  108. deepeval/models/__init__.py +2 -0
  109. deepeval/models/base_model.py +44 -6
  110. deepeval/models/embedding_models/azure_embedding_model.py +34 -12
  111. deepeval/models/embedding_models/local_embedding_model.py +22 -7
  112. deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
  113. deepeval/models/embedding_models/openai_embedding_model.py +3 -2
  114. deepeval/models/llms/__init__.py +2 -0
  115. deepeval/models/llms/amazon_bedrock_model.py +229 -73
  116. deepeval/models/llms/anthropic_model.py +143 -48
  117. deepeval/models/llms/azure_model.py +169 -95
  118. deepeval/models/llms/constants.py +2032 -0
  119. deepeval/models/llms/deepseek_model.py +82 -35
  120. deepeval/models/llms/gemini_model.py +126 -67
  121. deepeval/models/llms/grok_model.py +128 -65
  122. deepeval/models/llms/kimi_model.py +129 -87
  123. deepeval/models/llms/litellm_model.py +94 -18
  124. deepeval/models/llms/local_model.py +115 -16
  125. deepeval/models/llms/ollama_model.py +97 -76
  126. deepeval/models/llms/openai_model.py +169 -311
  127. deepeval/models/llms/portkey_model.py +58 -16
  128. deepeval/models/llms/utils.py +5 -2
  129. deepeval/models/retry_policy.py +10 -5
  130. deepeval/models/utils.py +56 -4
  131. deepeval/simulator/conversation_simulator.py +49 -2
  132. deepeval/simulator/template.py +16 -1
  133. deepeval/synthesizer/synthesizer.py +19 -17
  134. deepeval/test_case/api.py +24 -45
  135. deepeval/test_case/arena_test_case.py +7 -2
  136. deepeval/test_case/conversational_test_case.py +55 -6
  137. deepeval/test_case/llm_test_case.py +60 -6
  138. deepeval/test_run/api.py +3 -0
  139. deepeval/test_run/test_run.py +6 -1
  140. deepeval/utils.py +26 -0
  141. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
  142. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
  143. deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
  144. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
  145. deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
  146. deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
  147. deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
  148. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
  149. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
  150. {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/task_completion/task_completion.py
@@ -1,11 +1,12 @@
 from typing import Optional, List, Tuple, Union, Dict
 
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -15,7 +16,11 @@ from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.task_completion.template import TaskCompletionTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.task_completion.schema import *
+from deepeval.metrics.task_completion.schema import (
+    TaskAndOutcome,
+    TaskCompletionVerdict,
+)
+from deepeval.metrics.api import metric_data_manager
 
 
 class TaskCompletionMetric(BaseMetric):
@@ -23,7 +28,6 @@ class TaskCompletionMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_CALLED,
     ]
 
     def __init__(
@@ -58,9 +62,15 @@ class TaskCompletionMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -91,6 +101,12 @@ class TaskCompletionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
             return self.score
 
     async def a_measure(
@@ -100,9 +116,15 @@
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
-        if not has_trace:
-            check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -127,6 +149,12 @@
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
             return self.score
 
     async def _a_generate_verdicts(self) -> Tuple:
@@ -134,44 +162,26 @@
             task=self.task,
             actual_outcome=self.outcome,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=TaskCompletionVerdict
-            )
-            self.evaluation_cost += cost
-            return res.verdict, res.reason
-        else:
-            try:
-                res: TaskCompletionVerdict = await self.model.a_generate(
-                    prompt, schema=TaskCompletionVerdict
-                )
-                return res.verdict, res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["verdict"], data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskCompletionVerdict,
+            extract_schema=lambda s: (s.verdict, s.reason),
+            extract_json=lambda data: (data["verdict"], data["reason"]),
+        )
 
     def _generate_verdicts(self) -> Tuple:
         prompt = TaskCompletionTemplate.generate_verdict(
             task=self.task,
             actual_outcome=self.outcome,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=TaskCompletionVerdict
-            )
-            self.evaluation_cost += cost
-            return res.verdict, res.reason
-        else:
-            try:
-                res: TaskCompletionVerdict = self.model.generate(
-                    prompt, schema=TaskCompletionVerdict
-                )
-                return res.verdict, res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["verdict"], data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskCompletionVerdict,
+            extract_schema=lambda s: (s.verdict, s.reason),
+            extract_json=lambda data: (data["verdict"], data["reason"]),
+        )
 
     async def _a_extract_task_and_outcome(
         self,
@@ -189,22 +199,13 @@
             actual_output=test_case.actual_output,
             tools_called=test_case.tools_called,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=TaskAndOutcome
-            )
-            self.evaluation_cost += cost
-            return res.task, res.outcome
-        else:
-            try:
-                res: TaskAndOutcome = await self.model.a_generate(
-                    prompt, schema=TaskAndOutcome
-                )
-                return res.task, res.outcome
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"], data["outcome"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskAndOutcome,
+            extract_schema=lambda s: (s.task, s.outcome),
+            extract_json=lambda data: (data["task"], data["outcome"]),
+        )
 
     def _extract_task_and_outcome(
         self,
@@ -222,20 +223,13 @@
             actual_output=test_case.actual_output,
             tools_called=test_case.tools_called,
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=TaskAndOutcome)
-            self.evaluation_cost += cost
-            return res.task, res.outcome
-        else:
-            try:
-                res: TaskAndOutcome = self.model.generate(
-                    prompt, schema=TaskAndOutcome
-                )
-                return res.task, res.outcome
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return data["task"], data["outcome"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskAndOutcome,
+            extract_schema=lambda s: (s.task, s.outcome),
+            extract_json=lambda data: (data["task"], data["outcome"]),
+        )
 
     def _calculate_score(self):
         return (
@@ -250,7 +244,7 @@
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
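Note: across the metric files in this release, the repeated "if self.using_native_model / else / except TypeError" branching around model calls is replaced by two shared helpers imported from deepeval.metrics.utils, generate_with_schema_and_extract and a_generate_with_schema_and_extract. Their implementation is not included in this diff; the sketch below is an assumption reconstructed from the call sites above (the keyword arguments match the diff, the body mirrors the deleted branches) and is illustrative only, not deepeval's actual code.

```python
from typing import Any, Callable, Type

from pydantic import BaseModel


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    """Sketch of the shared helper the new call sites assume (sync variant)."""
    if metric.using_native_model:
        # Native deepeval models return (parsed_schema, cost); cost is tracked.
        res, cost = metric.model.generate(prompt, schema=schema_cls)
        metric.evaluation_cost += cost
        return extract_schema(res)
    try:
        # Custom models that accept a pydantic schema return the parsed object.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback for custom models without schema support: parse raw JSON,
        # as the removed branches did via deepeval's trimAndLoadJson helper
        # (assumed to still live in deepeval.metrics.utils).
        from deepeval.metrics.utils import trimAndLoadJson

        data = trimAndLoadJson(metric.model.generate(prompt), metric)
        return extract_json(data)
```

The async counterpart presumably does the same with `await metric.model.a_generate(...)`, which is why every deleted sync/async pair above collapses into a single keyword-argument call.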
deepeval/metrics/tool_correctness/tool_correctness.py
@@ -1,13 +1,14 @@
-from typing import List, Dict, Optional, Union
+from typing import List, Dict, Optional, Union, Tuple
 
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop, prettify_list
+from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
-    trimAndLoadJson,
     initialize_model,
     print_tools_called,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.test_case import (
@@ -62,7 +63,15 @@ class ToolCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
         self.test_case = test_case
         self.evaluation_cost = 0 if self.using_native_model else None
 
@@ -90,11 +99,9 @@
                     self.available_tools,
                 )
             else:
-                tool_selection_score = tool_selection_score = (
-                    ToolSelectionScore(
-                        score=1,
-                        reason="No available tools were provided to assess tool selection criteria",
-                    )
+                tool_selection_score = ToolSelectionScore(
+                    score=1,
+                    reason="No available tools were provided to assess tool selection criteria",
                 )
             score = min(tool_calling_score, tool_selection_score.score)
             self.score = (
@@ -165,7 +172,15 @@
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(test_case, self._required_params, self)
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -324,18 +339,13 @@
         prompt = ToolCorrectnessTemplate.get_tool_selection_score(
             user_input, tools_called_formatted, available_tools_formatted
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res = self.model.generate(prompt, schema=ToolSelectionScore)
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolSelectionScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     async def _a_get_tool_selection_score(
         self, user_input, tools_called, available_tools
@@ -345,25 +355,16 @@
         prompt = ToolCorrectnessTemplate.get_tool_selection_score(
             user_input, tools_called_formatted, available_tools_formatted
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ToolSelectionScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res = await self.model.a_generate(
-                    prompt, schema=ToolSelectionScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolSelectionScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     # Calculate score
-    def _calculate_score(self):
+    def _calculate_score(self) -> float:
         if self.should_exact_match:
             score = self._calculate_exact_match_score()
         elif self.should_consider_ordering:
@@ -382,7 +383,7 @@
         return 0 if self.strict_mode and score < self.threshold else score
 
     # Exact matching score
-    def _calculate_exact_match_score(self):
+    def _calculate_exact_match_score(self) -> float:
         if len(self.tools_called) != len(self.expected_tools):
             return 0.0
         if (
@@ -405,7 +406,7 @@
         return 1.0
 
     # Non exact matching score
-    def _calculate_non_exact_match_score(self):
+    def _calculate_non_exact_match_score(self) -> float:
         total_score = 0.0
         matched_called_tools = set()
         for expected_tool in self.expected_tools:
@@ -445,7 +446,7 @@
         )
 
     # Consider ordering score
-    def _compute_weighted_lcs(self):
+    def _compute_weighted_lcs(self) -> Tuple[List[ToolCall], float]:
         m, n = len(self.expected_tools), len(self.tools_called)
         dp = [[0.0] * (n + 1) for _ in range(m + 1)]
         for i in range(1, m + 1):
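The last hunk above only adds a return-type annotation, but for readers unfamiliar with the metric, `_compute_weighted_lcs` scores tool ordering via a weighted longest common subsequence between `expected_tools` and `tools_called`. The standalone sketch below illustrates the general dynamic-programming idea over tool names; the per-match weighting and the real `ToolCall` fields deepeval uses are not shown in this diff, so treat those details as assumptions.

```python
from dataclasses import dataclass
from typing import List, Tuple


@dataclass(frozen=True)
class ToolCall:
    # Simplified stand-in for deepeval.test_case.ToolCall; the real class has more fields.
    name: str


def weighted_lcs(expected: List[ToolCall], called: List[ToolCall]) -> Tuple[List[ToolCall], float]:
    """Classic LCS dynamic program; each matched tool contributes a weight of 1.0.

    The dp-table shape matches the context lines above
    (dp = [[0.0] * (n + 1) for _ in range(m + 1)]); deepeval's actual weights
    may also reflect argument similarity, which is not reconstructed here.
    """
    m, n = len(expected), len(called)
    dp = [[0.0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if expected[i - 1].name == called[j - 1].name:
                dp[i][j] = dp[i - 1][j - 1] + 1.0
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    # Backtrack to recover the matched subsequence.
    matched: List[ToolCall] = []
    i, j = m, n
    while i > 0 and j > 0:
        if expected[i - 1].name == called[j - 1].name:
            matched.append(expected[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return matched[::-1], dp[m][n]


# Example: two of the three expected tools were called, in order.
expected = [ToolCall("search"), ToolCall("fetch_page"), ToolCall("summarize")]
called = [ToolCall("search"), ToolCall("summarize")]
print(weighted_lcs(expected, called))
# ([ToolCall(name='search'), ToolCall(name='summarize')], 2.0)
```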
deepeval/metrics/tool_use/schema.py
@@ -17,3 +17,7 @@ class ToolSelectionScore(BaseModel):
 class ArgumentCorrectnessScore(BaseModel):
     score: float
     reason: str
+
+
+class Reason(BaseModel):
+    reason: str
deepeval/metrics/tool_use/template.py
@@ -161,6 +161,13 @@ class ToolUseTemplate:
     - The key patterns or trends in the sub-reasons (e.g., consistent correct choices, repeated irrelevant tool calls, missed best-fit tools).
     - A clear statement linking the **score** and **threshold** outcome (e.g., “The agent passed because…” or “Failed because…”).
 
+    **
+    IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+    Example JSON:
+    {{
+        "reason": "The score is <score> because <your_reason>."
+    }}
+
     RULES:
     - Focus on *which tools were selected* and *why that selection pattern was or wasn't appropriate*.
     - Mention specific issues or strengths like redundancy, misuse, or perfect matching.
@@ -178,7 +185,7 @@
     Threshold: {threshold}
     Result: {"PASS" if final_score >= threshold else "FAIL"}
 
-    Final Reason:
+    JSON:
     """
         )
 
@@ -199,6 +206,13 @@
     - The dominant strengths or weaknesses from the sub-reasons (e.g., correct parameterization, missing required fields, generic values, or misaligned arguments).
     - Whether the agent met or fell short of the threshold and why.
 
+    **
+    IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+    Example JSON:
+    {{
+        "reason": "The score is <score> because <your_reason>."
+    }}
+
     RULES:
     - Focus strictly on **argument correctness** and **context alignment** — not which tools were chosen.
     - Reference specific argument-level problems or successes where helpful.
@@ -215,6 +229,6 @@
     Threshold: {threshold}
     Result: {"PASS" if final_score >= threshold else "FAIL"}
 
-    Final Reason:
+    JSON:
     """
         )
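The template changes above stop asking for a free-text "Final Reason:" and instead instruct the judge model to return a JSON object with a single 'reason' key, which pairs with the `Reason` model added to deepeval/metrics/tool_use/schema.py. A minimal sketch of how such a response validates against that model (pydantic v2 API assumed; deepeval's actual parsing goes through its schema-aware generate helpers, as the tool_use.py hunks below show):

```python
from pydantic import BaseModel


class Reason(BaseModel):
    reason: str


# Example judge output in the JSON shape the updated prompt asks for.
raw = '{"reason": "The score is 0.75 because most tool selections matched the user intent."}'

parsed = Reason.model_validate_json(raw)  # pydantic v2; use Reason.parse_raw(raw) on v1
print(parsed.reason)
```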
deepeval/metrics/tool_use/tool_use.py
@@ -3,11 +3,11 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
-    print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     ConversationalTestCase,
@@ -23,6 +23,7 @@ from deepeval.metrics.tool_use.schema import (
     ToolSelectionScore,
     UserInputAndTools,
     ArgumentCorrectnessScore,
+    Reason,
 )
 from deepeval.metrics.api import metric_data_manager
 
@@ -61,7 +62,12 @@ class ToolUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -136,7 +142,12 @@ class ToolUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case, self._required_test_case_params, self
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -206,22 +217,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(
-                prompt, schema=ArgumentCorrectnessScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ArgumentCorrectnessScore = self.model.generate(
-                    prompt, schema=ArgumentCorrectnessScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ArgumentCorrectnessScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgumentCorrectnessScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgumentCorrectnessScore(**data),
+        )
 
     async def _a_get_argument_correctness_score(
         self,
@@ -233,22 +235,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ArgumentCorrectnessScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ArgumentCorrectnessScore = await self.model.a_generate(
-                    prompt, schema=ArgumentCorrectnessScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ArgumentCorrectnessScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgumentCorrectnessScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgumentCorrectnessScore(**data),
+        )
 
     def _get_tool_selection_score(
         self,
@@ -260,20 +253,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ToolSelectionScore = self.model.generate(
-                    prompt, schema=ToolSelectionScore
-                )
-                return res
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolSelectionScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     async def _a_get_tool_selection_score(
         self,
@@ -285,22 +271,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(
-                prompt, schema=ToolSelectionScore
-            )
-            self.evaluation_cost += cost
-            return res
-        else:
-            try:
-                res: ToolSelectionScore = await self.model.a_generate(
-                    prompt, schema=ToolSelectionScore
-                )
-                return res
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = trimAndLoadJson(res, self)
-                return ToolSelectionScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     def _get_user_input_and_turns(
         self,
@@ -380,13 +357,14 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason_for_argument_correctness(
         self,
@@ -400,13 +378,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
        )
-        if self.using_native_model:
-            res, cost = self.model.generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = self.model.generate(prompt)
-            return res
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason_for_tool_selection(
         self, tool_use_scores: List[ToolSelectionScore]
@@ -419,13 +397,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason_for_argument_correctness(
         self, argument_correctness_scores: List[ArgumentCorrectnessScore]
@@ -438,13 +416,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-        if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt)
-            self.evaluation_cost += cost
-            return res
-        else:
-            res = await self.model.a_generate(prompt)
-            return res
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def is_successful(self) -> bool:
         try: