deepeval 3.7.5__py3-none-any.whl → 3.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +675 -245
- deepeval/config/utils.py +9 -1
- deepeval/dataset/api.py +23 -1
- deepeval/dataset/golden.py +106 -21
- deepeval/evaluate/evaluate.py +0 -3
- deepeval/evaluate/execute.py +162 -315
- deepeval/evaluate/utils.py +6 -30
- deepeval/key_handler.py +124 -51
- deepeval/metrics/__init__.py +0 -4
- deepeval/metrics/answer_relevancy/answer_relevancy.py +89 -132
- deepeval/metrics/answer_relevancy/template.py +102 -179
- deepeval/metrics/arena_g_eval/arena_g_eval.py +98 -96
- deepeval/metrics/arena_g_eval/template.py +17 -1
- deepeval/metrics/argument_correctness/argument_correctness.py +81 -87
- deepeval/metrics/argument_correctness/template.py +19 -2
- deepeval/metrics/base_metric.py +19 -41
- deepeval/metrics/bias/bias.py +102 -108
- deepeval/metrics/bias/template.py +14 -2
- deepeval/metrics/contextual_precision/contextual_precision.py +56 -92
- deepeval/metrics/contextual_recall/contextual_recall.py +58 -85
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +53 -83
- deepeval/metrics/conversation_completeness/conversation_completeness.py +101 -119
- deepeval/metrics/conversation_completeness/template.py +23 -3
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -8
- deepeval/metrics/conversational_dag/nodes.py +66 -123
- deepeval/metrics/conversational_dag/templates.py +16 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +47 -66
- deepeval/metrics/dag/dag.py +10 -0
- deepeval/metrics/dag/nodes.py +63 -126
- deepeval/metrics/dag/templates.py +14 -0
- deepeval/metrics/exact_match/exact_match.py +9 -1
- deepeval/metrics/faithfulness/faithfulness.py +82 -136
- deepeval/metrics/g_eval/g_eval.py +93 -79
- deepeval/metrics/g_eval/template.py +18 -1
- deepeval/metrics/g_eval/utils.py +7 -6
- deepeval/metrics/goal_accuracy/goal_accuracy.py +91 -76
- deepeval/metrics/goal_accuracy/template.py +21 -3
- deepeval/metrics/hallucination/hallucination.py +60 -75
- deepeval/metrics/hallucination/template.py +13 -0
- deepeval/metrics/indicator.py +11 -10
- deepeval/metrics/json_correctness/json_correctness.py +40 -38
- deepeval/metrics/json_correctness/template.py +10 -0
- deepeval/metrics/knowledge_retention/knowledge_retention.py +60 -97
- deepeval/metrics/knowledge_retention/schema.py +9 -3
- deepeval/metrics/knowledge_retention/template.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +72 -43
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +93 -75
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +59 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +58 -64
- deepeval/metrics/mcp_use_metric/template.py +12 -0
- deepeval/metrics/misuse/misuse.py +77 -97
- deepeval/metrics/misuse/template.py +15 -0
- deepeval/metrics/multimodal_metrics/__init__.py +0 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +37 -38
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +55 -76
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +37 -38
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +37 -38
- deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +57 -76
- deepeval/metrics/non_advice/non_advice.py +79 -105
- deepeval/metrics/non_advice/template.py +12 -0
- deepeval/metrics/pattern_match/pattern_match.py +12 -4
- deepeval/metrics/pii_leakage/pii_leakage.py +75 -106
- deepeval/metrics/pii_leakage/template.py +14 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +63 -89
- deepeval/metrics/plan_adherence/template.py +11 -0
- deepeval/metrics/plan_quality/plan_quality.py +63 -87
- deepeval/metrics/plan_quality/template.py +9 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +78 -86
- deepeval/metrics/prompt_alignment/template.py +12 -0
- deepeval/metrics/role_adherence/role_adherence.py +48 -71
- deepeval/metrics/role_adherence/template.py +14 -0
- deepeval/metrics/role_violation/role_violation.py +75 -108
- deepeval/metrics/role_violation/template.py +12 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +55 -65
- deepeval/metrics/step_efficiency/template.py +11 -0
- deepeval/metrics/summarization/summarization.py +115 -183
- deepeval/metrics/summarization/template.py +19 -0
- deepeval/metrics/task_completion/task_completion.py +67 -73
- deepeval/metrics/tool_correctness/tool_correctness.py +43 -42
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +72 -94
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +21 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +68 -81
- deepeval/metrics/toxicity/template.py +13 -0
- deepeval/metrics/toxicity/toxicity.py +80 -99
- deepeval/metrics/turn_contextual_precision/schema.py +3 -3
- deepeval/metrics/turn_contextual_precision/template.py +9 -2
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +154 -154
- deepeval/metrics/turn_contextual_recall/schema.py +3 -3
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +148 -143
- deepeval/metrics/turn_contextual_relevancy/schema.py +2 -2
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +154 -157
- deepeval/metrics/turn_faithfulness/schema.py +1 -1
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +180 -203
- deepeval/metrics/turn_relevancy/template.py +14 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +56 -69
- deepeval/metrics/utils.py +161 -91
- deepeval/models/__init__.py +2 -0
- deepeval/models/base_model.py +44 -6
- deepeval/models/embedding_models/azure_embedding_model.py +34 -12
- deepeval/models/embedding_models/local_embedding_model.py +22 -7
- deepeval/models/embedding_models/ollama_embedding_model.py +17 -6
- deepeval/models/embedding_models/openai_embedding_model.py +3 -2
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +229 -73
- deepeval/models/llms/anthropic_model.py +143 -48
- deepeval/models/llms/azure_model.py +169 -95
- deepeval/models/llms/constants.py +2032 -0
- deepeval/models/llms/deepseek_model.py +82 -35
- deepeval/models/llms/gemini_model.py +126 -67
- deepeval/models/llms/grok_model.py +128 -65
- deepeval/models/llms/kimi_model.py +129 -87
- deepeval/models/llms/litellm_model.py +94 -18
- deepeval/models/llms/local_model.py +115 -16
- deepeval/models/llms/ollama_model.py +97 -76
- deepeval/models/llms/openai_model.py +169 -311
- deepeval/models/llms/portkey_model.py +58 -16
- deepeval/models/llms/utils.py +5 -2
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +56 -4
- deepeval/simulator/conversation_simulator.py +49 -2
- deepeval/simulator/template.py +16 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_case/api.py +24 -45
- deepeval/test_case/arena_test_case.py +7 -2
- deepeval/test_case/conversational_test_case.py +55 -6
- deepeval/test_case/llm_test_case.py +60 -6
- deepeval/test_run/api.py +3 -0
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/METADATA +3 -3
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/RECORD +145 -148
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/__init__.py +0 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +0 -386
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/schema.py +0 -11
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/template.py +0 -133
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/utils.py +0 -68
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/WHEEL +0 -0
- {deepeval-3.7.5.dist-info → deepeval-3.7.7.dist-info}/entry_points.txt +0 -0
deepeval/metrics/task_completion/task_completion.py:

@@ -1,11 +1,12 @@
 from typing import Optional, List, Tuple, Union, Dict
 
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     check_llm_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     LLMTestCase,
@@ -15,7 +16,11 @@ from deepeval.metrics import BaseMetric
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.task_completion.template import TaskCompletionTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.metrics.task_completion.schema import
+from deepeval.metrics.task_completion.schema import (
+    TaskAndOutcome,
+    TaskCompletionVerdict,
+)
+from deepeval.metrics.api import metric_data_manager
 
 
 class TaskCompletionMetric(BaseMetric):
@@ -23,7 +28,6 @@ class TaskCompletionMetric(BaseMetric):
     _required_params: List[LLMTestCaseParams] = [
         LLMTestCaseParams.INPUT,
         LLMTestCaseParams.ACTUAL_OUTPUT,
-        LLMTestCaseParams.TOOLS_CALLED,
     ]
 
     def __init__(
@@ -58,9 +62,15 @@ class TaskCompletionMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-
-
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -91,6 +101,12 @@ class TaskCompletionMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
+
         return self.score
 
     async def a_measure(
@@ -100,9 +116,15 @@ class TaskCompletionMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-
-
-
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -127,6 +149,12 @@ class TaskCompletionMetric(BaseMetric):
                 f"Score: {self.score}\nReason: {self.reason}",
             ],
         )
+
+        if _log_metric_to_confident:
+            metric_data_manager.post_metric_if_enabled(
+                self, test_case=test_case
+            )
+
         return self.score
 
     async def _a_generate_verdicts(self) -> Tuple:
@@ -134,44 +162,26 @@ class TaskCompletionMetric(BaseMetric):
             task=self.task,
             actual_outcome=self.outcome,
         )
-
-
-
-
-
-
-
-        try:
-            res: TaskCompletionVerdict = await self.model.a_generate(
-                prompt, schema=TaskCompletionVerdict
-            )
-            return res.verdict, res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["verdict"], data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskCompletionVerdict,
+            extract_schema=lambda s: (s.verdict, s.reason),
+            extract_json=lambda data: (data["verdict"], data["reason"]),
+        )
 
     def _generate_verdicts(self) -> Tuple:
         prompt = TaskCompletionTemplate.generate_verdict(
             task=self.task,
             actual_outcome=self.outcome,
         )
-
-
-
-
-
-
-
-        try:
-            res: TaskCompletionVerdict = self.model.generate(
-                prompt, schema=TaskCompletionVerdict
-            )
-            return res.verdict, res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["verdict"], data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskCompletionVerdict,
+            extract_schema=lambda s: (s.verdict, s.reason),
+            extract_json=lambda data: (data["verdict"], data["reason"]),
+        )
 
     async def _a_extract_task_and_outcome(
         self,
@@ -189,22 +199,13 @@ class TaskCompletionMetric(BaseMetric):
             actual_output=test_case.actual_output,
             tools_called=test_case.tools_called,
         )
-
-
-
-
-
-
-
-        try:
-            res: TaskAndOutcome = await self.model.a_generate(
-                prompt, schema=TaskAndOutcome
-            )
-            return res.task, res.outcome
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["task"], data["outcome"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskAndOutcome,
+            extract_schema=lambda s: (s.task, s.outcome),
+            extract_json=lambda data: (data["task"], data["outcome"]),
+        )
 
     def _extract_task_and_outcome(
         self,
@@ -222,20 +223,13 @@ class TaskCompletionMetric(BaseMetric):
             actual_output=test_case.actual_output,
             tools_called=test_case.tools_called,
         )
-
-
-
-
-
-
-
-                prompt, schema=TaskAndOutcome
-            )
-            return res.task, res.outcome
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["task"], data["outcome"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TaskAndOutcome,
+            extract_schema=lambda s: (s.task, s.outcome),
+            extract_json=lambda data: (data["task"], data["outcome"]),
+        )
 
     def _calculate_score(self):
         return (
@@ -250,7 +244,7 @@ class TaskCompletionMetric:
         else:
             try:
                 self.success = self.score >= self.threshold
-            except:
+            except TypeError:
                 self.success = False
         return self.success
 
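Note: the recurring change in this file (and in the tool_correctness and tool_use files below) is that each metric's hand-rolled try/except TypeError block, which attempted schema-constrained generation and fell back to trimAndLoadJson on plain text, is replaced by the shared helpers generate_with_schema_and_extract and a_generate_with_schema_and_extract from deepeval.metrics.utils. The diff shows only the call sites, not the helpers themselves. The sketch below is a minimal reconstruction of what the synchronous helper plausibly does, inferred from the removed blocks: the signature comes from the call sites, while _trim_and_load_json and the omission of native-model cost tracking (whose old-side lines were lost in this rendering) are assumptions, not the package's actual implementation.

import json
import re
from typing import Any, Callable, Type, TypeVar

from pydantic import BaseModel

SchemaT = TypeVar("SchemaT", bound=BaseModel)
ResultT = TypeVar("ResultT")


def _trim_and_load_json(raw: str) -> dict:
    # Stand-in for deepeval's trimAndLoadJson: pull the first JSON object out of
    # free-form model output (code fences, surrounding prose) and parse it.
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is None:
        raise ValueError("No JSON object found in model output")
    return json.loads(match.group(0))


def generate_with_schema_and_extract(
    metric: Any,                                   # the calling metric; provides .model
    prompt: str,
    schema_cls: Type[SchemaT],                     # e.g. TaskCompletionVerdict, ToolSelectionScore
    extract_schema: Callable[[SchemaT], ResultT],  # pulls fields off a validated schema instance
    extract_json: Callable[[dict], ResultT],       # pulls the same fields out of a raw JSON dict
) -> ResultT:
    try:
        # Models with structured-output support return a schema_cls instance directly.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Models without schema support: free-form generation, then a JSON fallback,
        # mirroring the removed per-metric blocks above.
        res = metric.model.generate(prompt)
        return extract_json(_trim_and_load_json(res))

Centralizing this fallback is what lets each metric drop its trimAndLoadJson import and shrink every schema-extraction method to a single call, as the before/after hunks show; the async variant a_generate_with_schema_and_extract presumably does the same with await self.model.a_generate.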
deepeval/metrics/tool_correctness/tool_correctness.py:

@@ -1,13 +1,14 @@
-from typing import List, Dict, Optional, Union
+from typing import List, Dict, Optional, Union, Tuple
 
 from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.utils import get_or_create_event_loop
+from deepeval.utils import get_or_create_event_loop
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
-    trimAndLoadJson,
     initialize_model,
     print_tools_called,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.test_case import (
@@ -62,7 +63,15 @@ class ToolCorrectnessMetric(BaseMetric):
         _log_metric_to_confident: bool = True,
     ) -> float:
 
-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
         self.test_case = test_case
         self.evaluation_cost = 0 if self.using_native_model else None
 
@@ -90,11 +99,9 @@ class ToolCorrectnessMetric(BaseMetric):
                 self.available_tools,
             )
         else:
-            tool_selection_score =
-
-
-                reason="No available tools were provided to assess tool selection criteria",
-            )
+            tool_selection_score = ToolSelectionScore(
+                score=1,
+                reason="No available tools were provided to assess tool selection criteria",
             )
         score = min(tool_calling_score, tool_selection_score.score)
         self.score = (
@@ -165,7 +172,15 @@ class ToolCorrectnessMetric(BaseMetric):
         _in_component: bool = False,
         _log_metric_to_confident: bool = True,
     ) -> float:
-        check_llm_test_case_params(
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            self.model,
+            test_case.multimodal,
+        )
 
         self.evaluation_cost = 0 if self.using_native_model else None
         with metric_progress_indicator(
@@ -324,18 +339,13 @@ class ToolCorrectnessMetric(BaseMetric):
         prompt = ToolCorrectnessTemplate.get_tool_selection_score(
             user_input, tools_called_formatted, available_tools_formatted
         )
-
-
-
-
-
-
-
-            return res
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ToolSelectionScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     async def _a_get_tool_selection_score(
         self, user_input, tools_called, available_tools
@@ -345,25 +355,16 @@ class ToolCorrectnessMetric(BaseMetric):
         prompt = ToolCorrectnessTemplate.get_tool_selection_score(
             user_input, tools_called_formatted, available_tools_formatted
         )
-
-
-
-
-
-
-
-        try:
-            res = await self.model.a_generate(
-                prompt, schema=ToolSelectionScore
-            )
-            return res
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ToolSelectionScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     # Calculate score
-    def _calculate_score(self):
+    def _calculate_score(self) -> float:
         if self.should_exact_match:
             score = self._calculate_exact_match_score()
         elif self.should_consider_ordering:
@@ -382,7 +383,7 @@ class ToolCorrectnessMetric:
         return 0 if self.strict_mode and score < self.threshold else score
 
     # Exact matching score
-    def _calculate_exact_match_score(self):
+    def _calculate_exact_match_score(self) -> float:
         if len(self.tools_called) != len(self.expected_tools):
             return 0.0
         if (
@@ -405,7 +406,7 @@ class ToolCorrectnessMetric:
             return 1.0
 
     # Non exact matching score
-    def _calculate_non_exact_match_score(self):
+    def _calculate_non_exact_match_score(self) -> float:
         total_score = 0.0
         matched_called_tools = set()
         for expected_tool in self.expected_tools:
@@ -445,7 +446,7 @@ class ToolCorrectnessMetric:
         )
 
     # Consider ordering score
-    def _compute_weighted_lcs(self):
+    def _compute_weighted_lcs(self) -> Tuple[List[ToolCall], float]:
         m, n = len(self.expected_tools), len(self.tools_called)
         dp = [[0.0] * (n + 1) for _ in range(m + 1)]
         for i in range(1, m + 1):
deepeval/metrics/tool_use/template.py:

@@ -161,6 +161,13 @@ class ToolUseTemplate:
 - The key patterns or trends in the sub-reasons (e.g., consistent correct choices, repeated irrelevant tool calls, missed best-fit tools).
 - A clear statement linking the **score** and **threshold** outcome (e.g., “The agent passed because…” or “Failed because…”).
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+"reason": "The score is <score> because <your_reason>."
+}}
+
 RULES:
 - Focus on *which tools were selected* and *why that selection pattern was or wasn't appropriate*.
 - Mention specific issues or strengths like redundancy, misuse, or perfect matching.
@@ -178,7 +185,7 @@ class ToolUseTemplate:
 Threshold: {threshold}
 Result: {"PASS" if final_score >= threshold else "FAIL"}
 
-
+JSON:
 """
         )
 
@@ -199,6 +206,13 @@ class ToolUseTemplate:
 - The dominant strengths or weaknesses from the sub-reasons (e.g., correct parameterization, missing required fields, generic values, or misaligned arguments).
 - Whether the agent met or fell short of the threshold and why.
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+"reason": "The score is <score> because <your_reason>."
+}}
+
 RULES:
 - Focus strictly on **argument correctness** and **context alignment** — not which tools were chosen.
 - Reference specific argument-level problems or successes where helpful.
@@ -215,6 +229,6 @@ class ToolUseTemplate:
 Threshold: {threshold}
 Result: {"PASS" if final_score >= threshold else "FAIL"}
 
-
+JSON:
 """
         )
deepeval/metrics/tool_use/tool_use.py:

@@ -3,11 +3,11 @@ import asyncio
 from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
-    trimAndLoadJson,
     get_unit_interactions,
-    print_tools_called,
     check_conversational_test_case_params,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.test_case import (
     ConversationalTestCase,
@@ -23,6 +23,7 @@ from deepeval.metrics.tool_use.schema import (
     ToolSelectionScore,
     UserInputAndTools,
     ArgumentCorrectnessScore,
+    Reason,
 )
 from deepeval.metrics.api import metric_data_manager
 
@@ -61,7 +62,12 @@ class ToolUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -136,7 +142,12 @@ class ToolUseMetric(BaseConversationalMetric):
         _log_metric_to_confident: bool = True,
     ):
         check_conversational_test_case_params(
-            test_case,
+            test_case,
+            self._required_test_case_params,
+            self,
+            False,
+            self.model,
+            test_case.multimodal,
         )
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -206,22 +217,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-
-
-
-
-
-
-
-        try:
-            res: ArgumentCorrectnessScore = self.model.generate(
-                prompt, schema=ArgumentCorrectnessScore
-            )
-            return res
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ArgumentCorrectnessScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgumentCorrectnessScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgumentCorrectnessScore(**data),
+        )
 
     async def _a_get_argument_correctness_score(
         self,
@@ -233,22 +235,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-
-
-
-
-
-
-
-        try:
-            res: ArgumentCorrectnessScore = await self.model.a_generate(
-                prompt, schema=ArgumentCorrectnessScore
-            )
-            return res
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ArgumentCorrectnessScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ArgumentCorrectnessScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ArgumentCorrectnessScore(**data),
+        )
 
     def _get_tool_selection_score(
         self,
@@ -260,20 +253,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-
-
-
-
-
-
-
-                prompt, schema=ToolSelectionScore
-            )
-            return res
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ToolSelectionScore(**data)
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     async def _a_get_tool_selection_score(
         self,
@@ -285,22 +271,13 @@ class ToolUseMetric(BaseConversationalMetric):
             user_and_tools.tools_called,
             user_and_tools.available_tools,
         )
-
-
-
-
-
-
-
-        try:
-            res: ToolSelectionScore = await self.model.a_generate(
-                prompt, schema=ToolSelectionScore
-            )
-            return res
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return ToolSelectionScore(**data)
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ToolSelectionScore,
+            extract_schema=lambda s: s,
+            extract_json=lambda data: ToolSelectionScore(**data),
+        )
 
     def _get_user_input_and_turns(
         self,
@@ -380,13 +357,14 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-
-
-            self
-
-
-
-
+
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _generate_reason_for_argument_correctness(
         self,
@@ -400,13 +378,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-
-
-
-
-
-
-
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason_for_tool_selection(
         self, tool_use_scores: List[ToolSelectionScore]
@@ -419,13 +397,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason_for_argument_correctness(
         self, argument_correctness_scores: List[ArgumentCorrectnessScore]
@@ -438,13 +416,13 @@ class ToolUseMetric(BaseConversationalMetric):
         prompt = ToolUseTemplate.get_tool_selection_final_reason(
             scores_and_reasons, self.score, self.threshold
         )
-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Reason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def is_successful(self) -> bool:
         try: