deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/tool_use/schema.py (new file)
@@ -0,0 +1,19 @@
+ from pydantic import BaseModel
+
+
+ class UserInputAndTools(BaseModel):
+     user_messages: str
+     assistant_messages: str
+     tools_called: str
+     available_tools: str
+     tools_used: bool
+
+
+ class ToolSelectionScore(BaseModel):
+     score: float
+     reason: str
+
+
+ class ArgumentCorrectnessScore(BaseModel):
+     score: float
+     reason: str
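These three Pydantic models are the structured outputs the metric expects its judge LLM to return. As a rough illustration only (the values below are invented, not taken from the package), parsed judge responses would look like:

from deepeval.metrics.tool_use.schema import ToolSelectionScore, ArgumentCorrectnessScore

# Hypothetical parsed judge outputs for one user/assistant interaction.
selection = ToolSelectionScore(
    score=0.75,
    reason="search_flights matched the request, but the extra get_weather call was redundant.",
)
arguments = ArgumentCorrectnessScore(
    score=1.0,
    reason="destination and date were passed exactly as the user specified them.",
)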
deepeval/metrics/tool_use/template.py (new file)
@@ -0,0 +1,220 @@
+ import textwrap
+ import json
+
+
+ class ToolUseTemplate:
+
+     @staticmethod
+     def get_tool_selection_score(
+         user_input: str,
+         assistant_messages: str,
+         tools_called: str,
+         available_tools: str,
+     ) -> str:
+         return textwrap.dedent(
+             f"""You are an expert evaluator assessing the **Tool Selection Quality** of an AI agent.
+
+ OBJECTIVE
+ Evaluate whether the agent **selected the most appropriate tools** for completing the user's task, given a list of available tools.
+
+ This metric focuses **only** on which tools were chosen — **not** how they were used or whether they succeeded.
+
+ EVALUATION RULES
+
+ 1. Relevance
+ - Each tool used must directly support the user's stated goal or a clear sub-task derived from it.
+ - Tools unrelated to the goal lower the score sharply.
+
+ 2. Appropriateness
+ - The chosen tools must match their described purpose.
+ - If a more suitable tool existed and was ignored, score ≤ 0.5.
+
+ 3. Necessity
+ - Every tool call must be justified by clear need.
+ - Redundant or speculative tool use (e.g., calling multiple tools that overlap) reduces the score.
+
+ 4. Strictness
+ - When uncertain if a tool was required or correctly chosen, assume it was **not** appropriate.
+ - Only perfect alignment between the task and tool choice earns a high score.
+
+ SCORING GUIDE:
+
+ - **1.0** → Every tool used was necessary and perfectly matched to the task; no better alternative ignored.
+ - **0.75** → Tool selection was mostly correct, with only minor redundancy or a small omission.
+ - **0.5** → Mixed quality; some appropriate selections, but others questionable or missing.
+ - **0.25** → Poor selection; major mismatches or misuse of available tools.
+ - **0.0** → Tool selection irrelevant, random, or unjustified.
+
+ OUTPUT FORMAT:
+
+ Return a JSON object with:
+
+ {{
+ "score": float between 0.0 and 1.0,
+ "reason": "1-3 factual sentences explaining which tools were appropriate or inappropriate for the task, referencing specific tool names."
+ }}
+
+ USER INPUT:
+ {user_input}
+
+ ASSISTANT MESSAGES:
+ {assistant_messages}
+
+ TOOLS CALLED:
+ {tools_called}
+
+ AVAILABLE TOOLS:
+ {available_tools}
+
+ JSON:
+ """
+         )
+
+     @staticmethod
+     def get_argument_correctness_score(
+         user_input: str,
+         assistant_messages: str,
+         tools_called: str,
+         available_tools: str,
+     ) -> str:
+         return textwrap.dedent(
+             f"""You are an expert evaluator assessing the **Tool Argument Quality** of an AI agent.
+
+ OBJECTIVE:
+
+ Evaluate whether the **arguments and parameters** passed to each tool were:
+ - Correctly structured and complete.
+ - Contextually appropriate for the user's goal.
+ - Compatible with each tool's intended purpose.
+
+ This metric focuses **only** on argument-level correctness and relevance — not which tools were chosen.
+
+ EVALUATION RULES
+
+ 1. Relevance
+ - Each argument must align with the task and the tool's documented input fields.
+ - Unrelated, empty, or default arguments reduce the score sharply.
+
+ 2. **Completeness**
+ - All required parameters must be provided.
+ - Missing or malformed arguments (e.g., wrong data types or incomplete context) lower the score.
+
+ 3. **Specificity**
+ - Arguments should reflect task-specific values, not generic placeholders.
+ - Overly vague or default arguments are penalized.
+
+ 4. **Justification**
+ - Each argument must make sense in context.
+ - If it doesn't clearly derive from the user's request, assume it's incorrect.
+
+ 5. **Strict Bias**
+ - When uncertain whether arguments fit the tool or task, assume they were **incorrect**.
+
+ SCORING GUIDE:
+
+ - **1.0** → All arguments are accurate, specific, and fully aligned with both the task and tool requirements.
+ - **0.75** → Mostly correct; minor omissions or small mismatches.
+ - **0.5** → Partial correctness; some valid parameters, but key ones missing or off-target.
+ - **0.25** → Poor argument quality; several invalid or irrelevant fields.
+ - **0.0** → Arguments nonsensical, generic, or unrelated to task/tool intent.
+
+ OUTPUT FORMAT:
+
+ Return a JSON object with:
+ {{
+ "score": float between 0.0 and 1.0,
+ "reason": "1-3 sentences explaining argument alignment or issues, referencing specific parameter names or values when possible."
+ }}
+
+ ---
+
+ USER INPUT:
+ {user_input}
+
+ ASSISTANT MESSAGES:
+ {assistant_messages}
+
+ TOOLS CALLED (with arguments):
+ {tools_called}
+
+ AVAILABLE TOOLS:
+ {available_tools}
+
+ JSON:
+ """
+         )
+
+     @staticmethod
+     def get_tool_selection_final_reason(
+         all_scores_and_reasons: str, final_score: float, threshold: float
+     ) -> str:
+         return textwrap.dedent(
+             f"""You are an expert evaluator summarizing the outcome of a **Tool Selection** evaluation.
+
+ You are given:
+ - A list of **tool selection sub-scores and reasons**, each describing how appropriately the agent chose tools for its task.
+ - The **final aggregated score** across all sub-evaluations.
+ - A **threshold** representing the minimum passing score.
+
+ Your task is to write a **single concise explanation (1-3 sentences)** that captures:
+ - Why the agent **passed or failed** based on tool choice quality.
+ - The key patterns or trends in the sub-reasons (e.g., consistent correct choices, repeated irrelevant tool calls, missed best-fit tools).
+ - A clear statement linking the **score** and **threshold** outcome (e.g., “The agent passed because…” or “Failed because…”).
+
+ RULES:
+ - Focus on *which tools were selected* and *why that selection pattern was or wasn't appropriate*.
+ - Mention specific issues or strengths like redundancy, misuse, or perfect matching.
+ - Avoid vague or subjective language such as “pretty good” or “reasonable”.
+ - Do **not** reference argument-level details; this summary is only for tool choice quality.
+ - The result must read as a self-contained, factual justification.
+
+ FORMAT:
+ Return only a single plain-text string. Do **not** include JSON or other formatting.
+
+ All Tool Selection Sub-Scores and Reasons:
+ {all_scores_and_reasons}
+
+ Final Score: {final_score}
+ Threshold: {threshold}
+ Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+ Final Reason:
+ """
+         )
+
+     @staticmethod
+     def get_tool_argument_final_reason(
+         all_scores_and_reasons: str, final_score: float, threshold: float
+     ) -> str:
+         return textwrap.dedent(
+             f"""You are an expert evaluator summarizing the outcome of a **Tool Argument Quality** evaluation.
+
+ You are given:
+ - A list of **argument-level sub-scores and reasons**, each evaluating whether the arguments passed to tools were accurate, complete, and contextually appropriate.
+ - The **final aggregated score** across all argument evaluations.
+ - A **threshold** representing the minimum passing score.
+
+ Your task is to write a **single concise explanation (1-3 sentences)** that clearly states:
+ - Why the agent **passed or failed** in its use of tool arguments.
+ - The dominant strengths or weaknesses from the sub-reasons (e.g., correct parameterization, missing required fields, generic values, or misaligned arguments).
+ - Whether the agent met or fell short of the threshold and why.
+
+ RULES:
+ - Focus strictly on **argument correctness** and **context alignment** — not which tools were chosen.
+ - Reference specific argument-level problems or successes where helpful.
+ - Keep language objective and factual; avoid speculation or vague phrasing.
+ - The summary must stand alone as a clear explanation of the final result.
+
+ FORMAT:
+ Return only a single plain-text string. Do **not** include JSON or any extra formatting.
+
+ All Tool Argument Sub-Scores and Reasons:
+ {all_scores_and_reasons}
+
+ Final Score: {final_score}
+ Threshold: {threshold}
+ Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+ Final Reason:
+ """
+         )
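Each template method is a plain prompt builder: it takes pre-serialized strings and returns the filled-in evaluator prompt. A minimal sketch of a call, with argument values made up purely for illustration:

from deepeval.metrics.tool_use.template import ToolUseTemplate

prompt = ToolUseTemplate.get_tool_selection_score(
    user_input="Book me a flight to Tokyo next Friday.",
    assistant_messages="I searched for flights and booked the cheapest option.",
    tools_called="ToolCall(name='search_flights'), ToolCall(name='book_flight')",
    available_tools="ToolCall(name='search_flights'), ToolCall(name='book_flight'), ToolCall(name='get_weather')",
)
# `prompt` is the full evaluator prompt ending in "JSON:", ready to send to the judge model.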
deepeval/metrics/tool_use/tool_use.py (new file)
@@ -0,0 +1,458 @@
+ from typing import Optional, List, Union
+ import asyncio
+ from deepeval.utils import get_or_create_event_loop, prettify_list
+ from deepeval.metrics.utils import (
+     construct_verbose_logs,
+     trimAndLoadJson,
+     get_unit_interactions,
+     print_tools_called,
+     check_conversational_test_case_params,
+     initialize_model,
+ )
+ from deepeval.test_case import (
+     ConversationalTestCase,
+     TurnParams,
+     ToolCall,
+     Turn,
+ )
+ from deepeval.metrics import BaseConversationalMetric
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.metrics.tool_use.template import ToolUseTemplate
+ from deepeval.metrics.tool_use.schema import (
+     ToolSelectionScore,
+     UserInputAndTools,
+     ArgumentCorrectnessScore,
+ )
+ from deepeval.metrics.api import metric_data_manager
+
+
+ class ToolUseMetric(BaseConversationalMetric):
+
+     _required_test_case_params = [
+         TurnParams.ROLE,
+         TurnParams.CONTENT,
+     ]
+
+     def __init__(
+         self,
+         available_tools: List[ToolCall],
+         threshold: float = 0.5,
+         model: Optional[Union[str, DeepEvalBaseLLM]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         self.available_tools = available_tools
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = initialize_model(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.include_reason = include_reason
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+
+     def measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         check_conversational_test_case_params(
+             test_case, self._required_test_case_params, self
+         )
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self, _show_indicator=_show_indicator, _in_component=_in_component
+         ):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_measure(
+                         test_case,
+                         _show_indicator=False,
+                         _in_component=_in_component,
+                         _log_metric_to_confident=_log_metric_to_confident,
+                     )
+                 )
+             else:
+                 unit_interactions = get_unit_interactions(test_case.turns)
+                 user_input_and_tools = self._get_user_input_and_turns(
+                     unit_interactions
+                 )
+                 tool_selection_scores = [
+                     self._get_tool_selection_score(user_and_tools)
+                     for user_and_tools in user_input_and_tools
+                 ]
+                 argument_correctness_scores = [
+                     self._get_argument_correctness_score(user_and_tools)
+                     for user_and_tools in user_input_and_tools
+                     if user_and_tools.tools_used
+                 ]
+                 self.score = self._calculate_score(
+                     tool_selection_scores, argument_correctness_scores
+                 )
+                 tool_selection_reason = (
+                     self._generate_reason_for_tool_selection(
+                         tool_selection_scores
+                     )
+                 )
+                 argument_correctness_reason = (
+                     self._generate_reason_for_argument_correctness(
+                         argument_correctness_scores
+                     )
+                 )
+                 self.reason = str(
+                     "\n".join(
+                         [tool_selection_reason, argument_correctness_reason]
+                     )
+                 )
+
+                 self.verbose_logs = construct_verbose_logs(
+                     self,
+                     steps=[
+                         f"Tool Selection Scores: {prettify_list(tool_selection_scores)} \n",
+                         f"Argument Correctness Scores: {prettify_list(argument_correctness_scores)} \n",
+                         f"Final Score: {self.score}",
+                         f"Final Reason: {self.reason}",
+                     ],
+                 )
+
+                 if _log_metric_to_confident:
+                     metric_data_manager.post_metric_if_enabled(
+                         self, test_case=test_case
+                     )
+
+             return self.score
+
+     async def a_measure(
+         self,
+         test_case: ConversationalTestCase,
+         _show_indicator: bool = True,
+         _in_component: bool = False,
+         _log_metric_to_confident: bool = True,
+     ):
+         check_conversational_test_case_params(
+             test_case, self._required_test_case_params, self
+         )
+
+         self.evaluation_cost = 0 if self.using_native_model else None
+         with metric_progress_indicator(
+             self,
+             async_mode=True,
+             _show_indicator=_show_indicator,
+             _in_component=_in_component,
+         ):
+             unit_interactions = get_unit_interactions(test_case.turns)
+             user_input_and_tools = self._get_user_input_and_turns(
+                 unit_interactions
+             )
+             tool_selection_scores = await asyncio.gather(
+                 *[
+                     self._a_get_tool_selection_score(user_and_tools)
+                     for user_and_tools in user_input_and_tools
+                 ]
+             )
+             argument_correctness_scores = await asyncio.gather(
+                 *[
+                     self._a_get_argument_correctness_score(user_and_tools)
+                     for user_and_tools in user_input_and_tools
+                     if user_and_tools.tools_used
+                 ]
+             )
+             self.score = self._calculate_score(
+                 tool_selection_scores, argument_correctness_scores
+             )
+             tool_selection_reason = (
+                 await self._a_generate_reason_for_tool_selection(
+                     tool_selection_scores
+                 )
+             )
+             argument_correctness_reason = (
+                 await self._a_generate_reason_for_argument_correctness(
+                     argument_correctness_scores
+                 )
+             )
+             self.reason = str(
+                 "\n".join([tool_selection_reason, argument_correctness_reason])
+             )
+
+             self.verbose_logs = construct_verbose_logs(
+                 self,
+                 steps=[
+                     f"Tool Selection Scores: {prettify_list(tool_selection_scores)} \n",
+                     f"Argument Correctness Scores: {prettify_list(argument_correctness_scores)} \n",
+                     f"Final Score: {self.score}",
+                     f"Final Reason: {self.reason}",
+                 ],
+             )
+
+             if _log_metric_to_confident:
+                 metric_data_manager.post_metric_if_enabled(
+                     self, test_case=test_case
+                 )
+
+             return self.score
+
+     def _get_argument_correctness_score(
+         self, user_and_tools: UserInputAndTools
+     ):
+         prompt = ToolUseTemplate.get_argument_correctness_score(
+             user_and_tools.user_messages,
+             user_and_tools.assistant_messages,
+             user_and_tools.tools_called,
+             user_and_tools.available_tools,
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(
+                 prompt, schema=ArgumentCorrectnessScore
+             )
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: ArgumentCorrectnessScore = self.model.generate(
+                     prompt, schema=ArgumentCorrectnessScore
+                 )
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return ArgumentCorrectnessScore(**data)
+
+     async def _a_get_argument_correctness_score(
+         self,
+         user_and_tools: UserInputAndTools,
+     ):
+         prompt = ToolUseTemplate.get_argument_correctness_score(
+             user_and_tools.user_messages,
+             user_and_tools.assistant_messages,
+             user_and_tools.tools_called,
+             user_and_tools.available_tools,
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(
+                 prompt, schema=ArgumentCorrectnessScore
+             )
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: ArgumentCorrectnessScore = await self.model.a_generate(
+                     prompt, schema=ArgumentCorrectnessScore
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return ArgumentCorrectnessScore(**data)
+
+     def _get_tool_selection_score(
+         self,
+         user_and_tools: UserInputAndTools,
+     ):
+         prompt = ToolUseTemplate.get_tool_selection_score(
+             user_and_tools.user_messages,
+             user_and_tools.assistant_messages,
+             user_and_tools.tools_called,
+             user_and_tools.available_tools,
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: ToolSelectionScore = self.model.generate(
+                     prompt, schema=ToolSelectionScore
+                 )
+                 return res
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return ToolSelectionScore(**data)
+
+     async def _a_get_tool_selection_score(
+         self,
+         user_and_tools: UserInputAndTools,
+     ):
+         prompt = ToolUseTemplate.get_tool_selection_score(
+             user_and_tools.user_messages,
+             user_and_tools.assistant_messages,
+             user_and_tools.tools_called,
+             user_and_tools.available_tools,
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(
+                 prompt, schema=ToolSelectionScore
+             )
+             self.evaluation_cost += cost
+             return res
+         else:
+             try:
+                 res: ToolSelectionScore = await self.model.a_generate(
+                     prompt, schema=ToolSelectionScore
+                 )
+                 return res
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = trimAndLoadJson(res, self)
+                 return ToolSelectionScore(**data)
+
+     def _get_user_input_and_turns(
+         self,
+         unit_interactions: List[List[Turn]],
+     ) -> List[UserInputAndTools]:
+         user_inputs_and_tools = []
+         available_tools = ",".join(
+             [repr(tool) for tool in self.available_tools]
+         )
+         for unit_interaction in unit_interactions:
+             if len(unit_interaction) < 2:
+                 continue
+             user_messages = ""
+             assistant_messages = ""
+             tools_called = []
+             tools_used = False
+             for turn in unit_interaction:
+                 if turn.role == "user":
+                     user_messages += f"{turn.content} \n"
+                 else:
+                     break
+             for turn in unit_interaction[1:]:
+                 if turn.role == "assistant":
+                     assistant_messages += f"{turn.content} \n"
+                     if turn.tools_called:
+                         tools_called.extend(turn.tools_called)
+                         tools_used = True
+             tools_called = ",".join([repr(tool) for tool in tools_called])
+             new_user_input_tools = UserInputAndTools(
+                 user_messages=user_messages,
+                 assistant_messages=assistant_messages,
+                 tools_called=tools_called,
+                 available_tools=available_tools,
+                 tools_used=tools_used,
+             )
+             user_inputs_and_tools.append(new_user_input_tools)
+         return user_inputs_and_tools
+
+     def _calculate_score(
+         self,
+         tool_use_scores: List[ToolSelectionScore],
+         argument_correctness_scores: List[ArgumentCorrectnessScore],
+     ):
+         tools_scores_sum = sum(
+             [tool_use_score.score for tool_use_score in tool_use_scores]
+         )
+         arguments_scores_sum = sum(
+             [
+                 argument_correctness_score.score
+                 for argument_correctness_score in argument_correctness_scores
+             ]
+         )
+         tool_selections_scores_divisor = (
+             len(tool_use_scores) if len(tool_use_scores) > 0 else 1
+         )
+         argument_correctness_score_divisor = (
+             len(argument_correctness_scores)
+             if len(argument_correctness_scores) > 0
+             else 1
+         )
+         tools_selction_score = tools_scores_sum / tool_selections_scores_divisor
+         argument_correctness_score = (
+             arguments_scores_sum / argument_correctness_score_divisor
+         )
+         score = min(tools_selction_score, argument_correctness_score)
+         return 0 if self.strict_mode and score < self.threshold else score
+
+     def _generate_reason_for_tool_selection(
+         self,
+         tool_use_scores: List[ToolSelectionScore],
+     ):
+         scores_and_reasons = ""
+         for tool_use in tool_use_scores:
+             scores_and_reasons += (
+                 f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+             )
+         prompt = ToolUseTemplate.get_tool_selection_final_reason(
+             scores_and_reasons, self.score, self.threshold
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = self.model.generate(prompt)
+             return res
+
+     def _generate_reason_for_argument_correctness(
+         self,
+         argument_correctness_scores: List[ArgumentCorrectnessScore],
+     ):
+         scores_and_reasons = ""
+         for tool_use in argument_correctness_scores:
+             scores_and_reasons += (
+                 f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+             )
+         prompt = ToolUseTemplate.get_tool_selection_final_reason(
+             scores_and_reasons, self.score, self.threshold
+         )
+         if self.using_native_model:
+             res, cost = self.model.generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = self.model.generate(prompt)
+             return res
+
+     async def _a_generate_reason_for_tool_selection(
+         self, tool_use_scores: List[ToolSelectionScore]
+     ):
+         scores_and_reasons = ""
+         for tool_use in tool_use_scores:
+             scores_and_reasons += (
+                 f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+             )
+         prompt = ToolUseTemplate.get_tool_selection_final_reason(
+             scores_and_reasons, self.score, self.threshold
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = await self.model.a_generate(prompt)
+             return res
+
+     async def _a_generate_reason_for_argument_correctness(
+         self, argument_correctness_scores: List[ArgumentCorrectnessScore]
+     ):
+         scores_and_reasons = ""
+         for tool_use in argument_correctness_scores:
+             scores_and_reasons += (
+                 f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
+             )
+         prompt = ToolUseTemplate.get_tool_selection_final_reason(
+             scores_and_reasons, self.score, self.threshold
+         )
+         if self.using_native_model:
+             res, cost = await self.model.a_generate(prompt)
+             self.evaluation_cost += cost
+             return res
+         else:
+             res = await self.model.a_generate(prompt)
+             return res
+
+     def is_successful(self) -> bool:
+         try:
+             self.success = self.score >= self.threshold
+         except (AttributeError, TypeError):
+             self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Tool Use"
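Putting the new metric together, the sketch below shows one plausible way to run it on a ConversationalTestCase. It is only an illustration: it assumes Turn accepts a tools_called list of ToolCall objects (which _get_user_input_and_turns reads) and that ToolCall takes name and input_parameters keyword arguments; the judge model falls back to whatever deepeval resolves by default.

from deepeval.test_case import ConversationalTestCase, Turn, ToolCall
from deepeval.metrics.tool_use.tool_use import ToolUseMetric

# Tools the agent was allowed to call during the conversation.
available_tools = [
    ToolCall(name="search_flights", input_parameters={"destination": "", "date": ""}),
    ToolCall(name="get_weather", input_parameters={"city": ""}),
]

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Find me a flight to Tokyo next Friday."),
        Turn(
            role="assistant",
            content="I found three options departing Friday morning.",
            tools_called=[
                ToolCall(
                    name="search_flights",
                    input_parameters={"destination": "Tokyo", "date": "next Friday"},
                )
            ],
        ),
    ]
)

metric = ToolUseMetric(available_tools=available_tools, threshold=0.5)
metric.measure(test_case)  # final score = min(tool selection avg, argument correctness avg)
print(metric.score, metric.reason)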
deepeval/metrics/topic_adherence/__init__.py (new file)
@@ -0,0 +1 @@
+ from .topic_adherence import TopicAdherenceMetric