deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. deepeval/_version.py +1 -1
  2. deepeval/config/settings.py +104 -36
  3. deepeval/config/utils.py +5 -0
  4. deepeval/dataset/dataset.py +162 -30
  5. deepeval/dataset/utils.py +41 -13
  6. deepeval/errors.py +20 -2
  7. deepeval/evaluate/execute.py +1662 -688
  8. deepeval/evaluate/types.py +1 -0
  9. deepeval/evaluate/utils.py +13 -3
  10. deepeval/integrations/crewai/__init__.py +2 -1
  11. deepeval/integrations/crewai/tool.py +71 -0
  12. deepeval/integrations/llama_index/__init__.py +0 -4
  13. deepeval/integrations/llama_index/handler.py +20 -21
  14. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  15. deepeval/metrics/__init__.py +13 -0
  16. deepeval/metrics/base_metric.py +1 -0
  17. deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
  18. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  19. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
  20. deepeval/metrics/dag/schema.py +1 -1
  21. deepeval/metrics/dag/templates.py +2 -2
  22. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  23. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  24. deepeval/metrics/goal_accuracy/schema.py +17 -0
  25. deepeval/metrics/goal_accuracy/template.py +235 -0
  26. deepeval/metrics/hallucination/hallucination.py +8 -8
  27. deepeval/metrics/indicator.py +21 -1
  28. deepeval/metrics/mcp/mcp_task_completion.py +7 -2
  29. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
  30. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
  31. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
  32. deepeval/metrics/plan_adherence/__init__.py +1 -0
  33. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  34. deepeval/metrics/plan_adherence/schema.py +11 -0
  35. deepeval/metrics/plan_adherence/template.py +170 -0
  36. deepeval/metrics/plan_quality/__init__.py +1 -0
  37. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  38. deepeval/metrics/plan_quality/schema.py +11 -0
  39. deepeval/metrics/plan_quality/template.py +101 -0
  40. deepeval/metrics/step_efficiency/__init__.py +1 -0
  41. deepeval/metrics/step_efficiency/schema.py +11 -0
  42. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  43. deepeval/metrics/step_efficiency/template.py +256 -0
  44. deepeval/metrics/task_completion/task_completion.py +1 -0
  45. deepeval/metrics/tool_correctness/schema.py +6 -0
  46. deepeval/metrics/tool_correctness/template.py +88 -0
  47. deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
  48. deepeval/metrics/tool_use/__init__.py +1 -0
  49. deepeval/metrics/tool_use/schema.py +19 -0
  50. deepeval/metrics/tool_use/template.py +220 -0
  51. deepeval/metrics/tool_use/tool_use.py +458 -0
  52. deepeval/metrics/topic_adherence/__init__.py +1 -0
  53. deepeval/metrics/topic_adherence/schema.py +16 -0
  54. deepeval/metrics/topic_adherence/template.py +162 -0
  55. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  56. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  57. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  58. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  59. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  60. deepeval/models/llms/amazon_bedrock_model.py +20 -17
  61. deepeval/models/llms/openai_model.py +10 -1
  62. deepeval/models/retry_policy.py +103 -20
  63. deepeval/openai/extractors.py +61 -16
  64. deepeval/openai/patch.py +8 -12
  65. deepeval/openai/types.py +1 -1
  66. deepeval/openai/utils.py +108 -1
  67. deepeval/prompt/prompt.py +1 -0
  68. deepeval/prompt/utils.py +43 -14
  69. deepeval/simulator/conversation_simulator.py +25 -18
  70. deepeval/synthesizer/chunking/context_generator.py +9 -1
  71. deepeval/synthesizer/synthesizer.py +11 -10
  72. deepeval/test_case/llm_test_case.py +6 -2
  73. deepeval/test_run/test_run.py +190 -207
  74. deepeval/tracing/__init__.py +2 -1
  75. deepeval/tracing/otel/exporter.py +3 -4
  76. deepeval/tracing/otel/utils.py +23 -4
  77. deepeval/tracing/trace_context.py +53 -38
  78. deepeval/tracing/tracing.py +23 -0
  79. deepeval/tracing/types.py +16 -14
  80. deepeval/utils.py +21 -0
  81. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
  82. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
  83. deepeval/integrations/llama_index/agent/patched.py +0 -68
  84. deepeval/tracing/message_types/__init__.py +0 -10
  85. deepeval/tracing/message_types/base.py +0 -6
  86. deepeval/tracing/message_types/messages.py +0 -14
  87. deepeval/tracing/message_types/tools.py +0 -18
  88. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
  89. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
  90. {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
deepeval/metrics/step_efficiency/template.py
@@ -0,0 +1,256 @@
+ import textwrap
+ import json
+ from deepeval.tracing.utils import make_json_serializable
+
+
+ class StepEfficiencyTemplate:
+
+ @staticmethod
+ def extract_task_from_trace(trace: dict) -> str:
+ return textwrap.dedent(
+ f"""You are a **trace analyst** tasked with extracting the **user's original goal or task** from a complete nested execution trace of an AI agent.
+
+ YOUR OBJECTIVE:
+
+ Identify and describe **exactly what the user asked the agent to do**, based only on the user's explicit input and any unambiguous contextual details present in the trace.
+
+ Your goal is to produce a **concise, fact-based statement** that captures the *intended user task* — not the agent's plan, actions, reasoning, or assumptions.
+
+ STRICT EXTRACTION RULES:
+
+ 1. Primary Source: Root-Level User Input
+ - The user's task must be derived **directly and primarily** from the root agent's `"input"` field.
+ - If that field contains nested `"input"` or `"messages"`, extract the true user instruction or request text from within it.
+
+ 2. Secondary Context: Subtasks as Clarifiers (Optional)
+ - You may use child spans (tools, retrievers, LLMs) **only** to clarify or disambiguate what the user explicitly asked for —
+ e.g., to confirm that the task involves multiple subtasks the user clearly implied (like booking and planning steps for a trip).
+ - You may **NOT** infer new goals that the user did not state or imply.
+
+ 3. No Hallucination
+ - Do **NOT** invent goals, assumptions, or implied needs beyond what is explicitly or clearly inferable from the input.
+ - If the user's request is vague, preserve that vagueness — do not expand it.
+
+ 4. Agent-Agnostic Rule
+ - Ignore the agent's tools, methods, reasoning, or internal operations.
+ - The task reflects **what the user wanted**, not how the agent chose to approach it.
+
+ 5. Perspective
+ - Express the extracted task **from the user's perspective**, as if restating what they asked the system to do.
+ - Avoid any meta or evaluative phrasing (“The user wanted the agent to…”).
+
+ 6. Fallback Condition
+ - If the only available information about the task is the raw user input text, return that input verbatim without modification.
+
+ OUTPUT FORMAT:
+
+ Return **only** a JSON object of this form:
+
+ {{
+ "task": "<a single clear sentence summarizing the user's explicit goal>"
+ }}
+
+ - The `"task"` value should be a single, coherent natural language sentence or two at most.
+ - Do not include commentary, metadata, or any additional fields.
+
+ EXAMPLES:
+
+ Example Trace: {{
+ "name": "trip_planner",
+ "type": "agent",
+ "input": {{
+ "input": "Help me plan a business trip to Chicago next week."
+ }},
+ "children": [
+ {{
+ "name": "flight_tool",
+ "type": "tool",
+ "input": {{
+ "inputParameters": {{
+ "destination": "Chicago",
+ "date": "2024-07-10"
+ }} }},
+ "output": {{
+ "flights": ["Flight 101", "Flight 202"]
+ }},
+ "children": []
+ }},
+ {{
+ "name": "hotel_tool",
+ "type": "tool",
+ "input": {{
+ "inputParameters": {{
+ "location": "Chicago",
+ "check_in": "2024-07-10",
+ "check_out": "2024-07-12"
+ }} }},
+ "output": {{
+ "hotels": ["The Grand Chicago", "Lakeview Inn"]
+ }},
+ "children": []
+ }},
+ {{
+ "name": "agenda_llm",
+ "type": "llm",
+ "input": {{
+ "prompt": "Draft a meeting agenda",
+ "input": [
+ {{
+ "role": "system",
+ "content": "You are an executive assistant."
+ }},
+ {{
+ "role": "user",
+ "content": "Create an agenda for a client strategy meeting."
+ }}
+ ]
+ }},
+ "output": "1. Q2 review\\n2. Client feedback\\n3. Strategy planning",
+ "children": []
+ }}
+ ]
+ }}
+
+ Expected JSON:
+ {{
+ "task": "Plan a business trip to Chicago next week, including booking a flight, reserving a hotel, and drafting a client meeting agenda."
+ }}
+
+ IMPORTANT ENFORCEMENT RULES:
+
+ - If multiple user inputs exist, identify the overall task that user has in mind.
+ - Do not include execution details, tools, function names, or reasoning text.
+ - Avoid restating or paraphrasing beyond clarity; preserve the user's intent exactly.
+ - When uncertain, extract **less rather than more** — prefer minimal, factual phrasing over speculative completion.
+
+ TRACE DATA:
+
+ {json.dumps(trace, default=make_json_serializable, indent=2)}
+
+ ---
+
+ ### JSON:
+ """
+ )
+
+ @staticmethod
+ def get_execution_efficiency(task: str, trace: dict) -> str:
+ return textwrap.dedent(
+ f"""You are an **efficiency auditor** evaluating how economically an AI agent executed a task.
+
+ OBJECTIVE:
+
+ Determine how **efficiently** the agent executed the given task based on its full execution trace.
+ Efficiency means achieving the user's goal using the **fewest, simplest, and most direct** actions possible.
+
+ You must assign a score from **0.0 to 1.0** that reflects how close the execution came to the *minimal necessary sequence of actions*.
+
+ **Important:** You are not evaluating correctness, completeness, creativity, or helpfulness — only the *efficiency* of the execution.
+
+ STRICT EVALUATION RULES:
+
+ 1. Zero-Tolerance for Unnecessary Actions
+ - Every step, tool call, LLM query, or retrieval must be **strictly required** to fulfill the task.
+ - If a single tool, retrieval, or reasoning step is superfluous, speculative, repetitive, or stylistic,
+ the score must be as low as possible, regardless of outcome quality.
+ - Adding “helpful” or “contextual” actions that were not explicitly necessary is an inefficiency.
+
+ 2. Minimal Action Principle
+ - The ideal execution performs the **exact minimum number of steps** needed to complete the task.
+ - Each step must directly contribute to completing the task, not to exploration, confirmation, or elaboration.
+
+ 3. No Speculation or Enrichment
+ - Any activity aimed at *enhancing*, *expanding*, or *beautifying* the answer (e.g., extra retrievals, style edits, rephrasings)
+ reduces the score sharply (≤ 0.25).
+ - Efficiency is about restraint — **doing exactly what's required, nothing more**.
+
+ 4. Directness and Focus
+ - Steps must appear in a logically minimal sequence from input to goal.
+ - Repetition, re-querying, nested reasoning loops, or tool reuse when not needed
+ indicate inefficiency.
+
+ 5. Resource Economy
+ - Use of multiple LLM calls, retrievers, or tools when one would suffice must be penalized.
+ - Avoided resources (if the agent achieved the task through simpler means) improve efficiency.
+
+ 6. When in Doubt
+ - If it is unclear whether an action was required or not, **assume it was unnecessary** and lower the score.
+ - Err on the side of penalizing over generosity.
+
+ SCORING SCALE (STRICT)
+
+ - **1.0 — Perfectly efficient**
+ - Only essential steps taken.
+ - Each action was directly necessary for task completion.
+ - No speculative, redundant, or decorative work.
+
+ - **0.75 — Strong efficiency**
+ - Mostly minimal execution with one small redundant or stylistic step.
+ - Slight overuse of a tool or repeated call, but otherwise tight.
+
+ - **0.5 — Moderate efficiency**
+ - Noticeable inefficiency: extra steps, unnecessary tool calls, or indirect methods.
+ - The same task could clearly have been completed faster or with fewer actions.
+
+ - **0.25 — Low efficiency**
+ - Multiple irrelevant or unjustified actions taken.
+ - Execution path significantly longer or more complex than needed.
+
+ - **0.0 — Highly inefficient**
+ - Execution was verbose, exploratory, speculative, or wasteful.
+ - Most actions were unnecessary or unrelated to achieving the core task.
+
+ *When uncertain, always assign the lower score.*
+
+ OUTPUT FORMAT:
+
+ Return a single JSON object in this exact format:
+
+ {{
+ "score": 0.0,
+ "reason": "1-3 concise factual sentences describing where inefficiencies occurred."
+ }}
+
+ The `reason` must:
+ - Identify specific inefficient actions (e.g., redundant LLM call, unnecessary retrieval, speculative tool use).
+ - Avoid subjective phrasing (“reasonable”, “seems okay”, “somewhat efficient”).
+ - Be direct and concrete: “Extra retrieval used for enrichment”, “Multiple summarizations of same data”, etc.
+
+ EXAMPLES
+
+ **Example 1:**
+ Task: "Summarize the given text."
+ Trace: Agent calls an LLM twice, then performs an extra web search.
+
+ → Output:
+ {{
+ "score": 0.25,
+ "reason": "The agent used redundant LLM calls and performed an unnecessary web search. Only one LLM call was required for the summary."
+ }}
+
+ **Example 2:**
+ Task: "Convert a date to ISO format."
+ Trace: Agent performs one computation directly.
+
+ → Output:
+ {{
+ "score": 1.0,
+ "reason": "The agent completed the task with one minimal action and no unnecessary steps."
+ }}
+
+ FINAL REMINDERS
+
+ - Efficiency = minimality. Any extra work, enrichment, or indirect approach must lower the score.
+ - Do not consider correctness, helpfulness, or reasoning quality.
+ - A “good answer” can still score **0.0** if it was achieved inefficiently.
+ - This metric is adversarial: assign the lowest score possible unless execution was provably minimal.
+
+ TASK:
+ {task}
+
+ TRACE:
+ {json.dumps(trace, indent=2, default=str)}
+
+ JSON:
+ """
+ )
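Note: both template methods above are plain static methods that return prompt strings, so they can be inspected without running the metric. A minimal sketch of how they might be driven (the trace dict and its field values below are illustrative assumptions, not values from this diff):

from deepeval.metrics.step_efficiency.template import StepEfficiencyTemplate

# Hypothetical hand-built trace in the nested span shape the prompts describe.
trace = {
    "name": "date_agent",
    "type": "agent",
    "input": {"input": "Convert 12 March 2024 to ISO format."},
    "children": [],
}

# Prompt asking a judge LLM to return {"task": "..."} extracted from the trace.
task_prompt = StepEfficiencyTemplate.extract_task_from_trace(trace)

# Prompt asking the judge to return {"score": ..., "reason": "..."} for that task.
score_prompt = StepEfficiencyTemplate.get_execution_efficiency(
    task="Convert 12 March 2024 to ISO format.", trace=trace
)

The accompanying step_efficiency.py (also added in this release) is presumably what sends these prompts to the evaluation model and parses the returned JSON.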
deepeval/metrics/task_completion/task_completion.py
@@ -44,6 +44,7 @@ class TaskCompletionMetric(BaseMetric):
  self.async_mode = async_mode
  self.strict_mode = strict_mode
  self.verbose_mode = verbose_mode
+ self.requires_trace = True
 
  def measure(
  self,
deepeval/metrics/tool_correctness/schema.py
@@ -0,0 +1,6 @@
+ from pydantic import BaseModel
+
+
+ class ToolSelectionScore(BaseModel):
+ score: float
+ reason: str
deepeval/metrics/tool_correctness/template.py
@@ -0,0 +1,88 @@
+ import textwrap
+ import json
+
+
+ class ToolCorrectnessTemplate:
+
+ @staticmethod
+ def get_tool_selection_score(
+ user_input: str, tools_called: list, available_tools: list
+ ) -> str:
+ return textwrap.dedent(
+ f"""You are an expert evaluator assessing the **Tool Selection** quality of an AI agent.
+
+ You are given:
+ - The **user input** that defines the user's goal / task.
+ - A list of **available tools**, each with a name and description.
+ - A list of **tool calls made** by the agent during execution, including tool name and parameters.
+
+ Your job is to assign a **Tool Selection score** from 0.0 to 1.0 based on how appropriate and well-matched the agent's chosen tools were to the task's requirements.
+
+ ---
+
+ DEFINITION:
+
+ Tool Selection evaluates how suitable the agent's tool choices were in addressing the task and sub-tasks.
+
+ This metric does **not** consider:
+ - How well the tools were used (execution quality)
+ - Whether the agent adhered to a plan
+ - Whether the output was correct or efficient
+
+ It only assesses whether the **right tools** were selected, based on their stated descriptions and the demands of the task.
+
+ ---
+
+ INSTRUCTIONS:
+
+ Step 1: Read the **user task** to understand what needed to be accomplished.
+
+ Step 2: Examine the **available tools** and their descriptions to understand the intended purpose of each.
+
+ Step 3: Review the **tool calls made by the agent**:
+ - Were the selected tools well-aligned with the task?
+ - Were any obviously better-suited tools ignored?
+ - Were any tools misapplied or used unnecessarily?
+
+ Step 4: Identify selection issues:
+ - **Correct Selection**: Tool(s) chosen directly and appropriately matched the subtask.
+ - **Over-selection**: More tools were selected than necessary, despite availability of a simpler or more direct option.
+ - **Under-selection**: Key tools that were well-suited were omitted.
+ - **Mis-selection**: Tools were chosen that were poorly matched to their purpose or the subtask.
+
+ ---
+
+ SCORING GUIDE:
+
+ - **1.0** → All selected tools were appropriate and necessary. No better-suited tools were omitted.
+ - **0.75** → Tool choices were mostly appropriate, with minor omissions or unnecessary use.
+ - **0.5** → Mixed tool selection. Some useful tools ignored or some inappropriate ones used.
+ - **0.25** → Poor tool selection. Better alternatives were available and ignored.
+ - **0.0** → Tool selection was clearly misaligned with task requirements.
+
+ ---
+
+ OUTPUT FORMAT:
+
+ Return a valid JSON object with this exact structure:
+ {{
+ "score": float between 0.0 and 1.0,
+ "reason": "1-3 concise, factual sentences explaining the score. Reference specific tool names and descriptions when relevant."
+ }}
+
+ Do not include any additional commentary or output outside the JSON object.
+
+ ---
+
+ USER INPUT:
+ {user_input}
+
+ ALL AVAILABLE TOOLS:
+ {available_tools}
+
+ TOOL CALLS MADE BY AGENT:
+ {tools_called}
+
+ JSON:
+ """
+ )
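Note: get_tool_selection_score likewise only renders a prompt string; in the metric below, the tool lists are pre-formatted with print_tools_called before being passed in. A rough sketch under those assumptions (tool names and descriptions are invented for illustration):

from deepeval.metrics.tool_correctness.template import ToolCorrectnessTemplate

# Hypothetical formatted tool descriptions and calls.
available_tools = ["flight_search: finds flights", "weather_lookup: current weather"]
tools_called = ["flight_search(destination='Chicago')"]

prompt = ToolCorrectnessTemplate.get_tool_selection_score(
    user_input="Book a flight to Chicago next Monday",
    tools_called=tools_called,
    available_tools=available_tools,
)
# The judge model is expected to answer with {"score": ..., "reason": "..."},
# which the metric validates against the ToolSelectionScore schema above.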
deepeval/metrics/tool_correctness/tool_correctness.py
@@ -1,10 +1,15 @@
- from typing import List, Dict
+ from typing import List, Dict, Optional, Union
 
  from deepeval.metrics.indicator import metric_progress_indicator
+ from deepeval.utils import get_or_create_event_loop, prettify_list
  from deepeval.metrics.utils import (
  construct_verbose_logs,
  check_llm_test_case_params,
+ trimAndLoadJson,
+ initialize_model,
+ print_tools_called,
  )
+ from deepeval.models import DeepEvalBaseLLM
  from deepeval.test_case import (
  LLMTestCase,
  LLMTestCaseParams,
@@ -13,6 +18,8 @@ from deepeval.test_case import (
  )
  from deepeval.metrics import BaseMetric
  from deepeval.metrics.api import metric_data_manager
+ from deepeval.metrics.tool_correctness.template import ToolCorrectnessTemplate
+ from deepeval.metrics.tool_correctness.schema import ToolSelectionScore
 
 
  class ToolCorrectnessMetric(BaseMetric):
@@ -25,15 +32,21 @@ class ToolCorrectnessMetric(BaseMetric):
 
  def __init__(
  self,
+ available_tools: List[ToolCall] = None,
  threshold: float = 0.5,
  evaluation_params: List[ToolCallParams] = [],
+ model: Optional[Union[str, DeepEvalBaseLLM]] = None,
  include_reason: bool = True,
+ async_mode: bool = True,
  strict_mode: bool = False,
  verbose_mode: bool = False,
  should_exact_match: bool = False,
  should_consider_ordering: bool = False,
  ):
+ self.available_tools = available_tools
  self.threshold = 1 if strict_mode else threshold
+ self.model, self.using_native_model = initialize_model(model)
+ self.async_mode = async_mode
  self.include_reason = include_reason
  self.strict_mode = strict_mode
  self.verbose_mode = verbose_mode
@@ -51,14 +64,140 @@ class ToolCorrectnessMetric(BaseMetric):
 
  check_llm_test_case_params(test_case, self._required_params, self)
  self.test_case = test_case
+ self.evaluation_cost = 0 if self.using_native_model else None
+
  with metric_progress_indicator(
  self, _show_indicator=_show_indicator, _in_component=_in_component
+ ):
+ if self.async_mode:
+ loop = get_or_create_event_loop()
+ loop.run_until_complete(
+ self.a_measure(
+ test_case,
+ _show_indicator=False,
+ _in_component=_in_component,
+ _log_metric_to_confident=_log_metric_to_confident,
+ )
+ )
+ else:
+ self.tools_called: List[ToolCall] = test_case.tools_called
+ self.expected_tools: List[ToolCall] = test_case.expected_tools
+ tool_calling_score = self._calculate_score()
+ if self.available_tools:
+ tool_selection_score = self._get_tool_selection_score(
+ test_case.input,
+ test_case.tools_called,
+ self.available_tools,
+ )
+ else:
+ tool_selection_score = tool_selection_score = (
+ ToolSelectionScore(
+ score=1,
+ reason="No available tools were provided to assess tool selection criteria",
+ )
+ )
+ score = min(tool_calling_score, tool_selection_score.score)
+ self.score = (
+ 0 if self.strict_mode and score < self.threshold else score
+ )
+ tool_calling_reason = self._generate_reason()
+ self.reason = self._construct_final_reason(
+ tool_calling_reason, tool_selection_score.reason
+ )
+ self.success = self.score >= self.threshold
+
+ expected_tools_formatted = (
+ "Expected Tools:\n[\n"
+ + ",\n".join(
+ self.indent_multiline_string(
+ repr(tool_call), indent_level=4
+ )
+ for tool_call in self.expected_tools
+ )
+ + "\n]"
+ )
+ tools_called_formatted = (
+ "Tools Called:\n[\n"
+ + ",\n".join(
+ self.indent_multiline_string(
+ repr(tool_call), indent_level=4
+ )
+ for tool_call in self.tools_called
+ )
+ + "\n]"
+ )
+ available_tools_formatted = (
+ (
+ "Available Tools:\n[\n"
+ + ",\n".join(
+ self.indent_multiline_string(
+ repr(tool_call), indent_level=4
+ )
+ for tool_call in self.available_tools
+ )
+ + "\n]"
+ )
+ if self.available_tools
+ else "Available Tools: []"
+ )
+ self.verbose_logs = construct_verbose_logs(
+ self,
+ steps=[
+ f"{expected_tools_formatted}",
+ f"{tools_called_formatted}",
+ f"{available_tools_formatted}",
+ f"Tool Selection Score: {tool_selection_score.score}",
+ f"Tool Selection Reason: {tool_selection_score.reason}",
+ f"Final Score: {self.score}\nFinal Reason: {self.reason}",
+ ],
+ )
+
+ if _log_metric_to_confident:
+ metric_data_manager.post_metric_if_enabled(
+ self, test_case=test_case
+ )
+ return self.score
+
+ async def a_measure(
+ self,
+ test_case: LLMTestCase,
+ _show_indicator: bool = True,
+ _in_component: bool = False,
+ _log_metric_to_confident: bool = True,
+ ) -> float:
+ check_llm_test_case_params(test_case, self._required_params, self)
+
+ self.evaluation_cost = 0 if self.using_native_model else None
+ with metric_progress_indicator(
+ self,
+ async_mode=True,
+ _show_indicator=_show_indicator,
+ _in_component=_in_component,
  ):
  self.tools_called: List[ToolCall] = test_case.tools_called
  self.expected_tools: List[ToolCall] = test_case.expected_tools
- self.score = self._calculate_score()
- self.reason = self._generate_reason()
+ tool_calling_score = self._calculate_score()
+ if self.available_tools:
+ tool_selection_score = await self._a_get_tool_selection_score(
+ test_case.input,
+ test_case.tools_called,
+ self.available_tools,
+ )
+ else:
+ tool_selection_score = ToolSelectionScore(
+ score=1,
+ reason="No available tools were provided to assess tool selection criteria",
+ )
+ score = min(tool_calling_score, tool_selection_score.score)
+ self.score = (
+ 0 if self.strict_mode and score < self.threshold else score
+ )
+ tool_calling_reason = self._generate_reason()
+ self.reason = self._construct_final_reason(
+ tool_calling_reason, tool_selection_score.reason
+ )
  self.success = self.score >= self.threshold
+
  expected_tools_formatted = (
  "Expected Tools:\n[\n"
  + ",\n".join(
@@ -79,12 +218,31 @@ class ToolCorrectnessMetric(BaseMetric):
  )
  + "\n]"
  )
- steps = [
- f"{expected_tools_formatted}",
- f"{tools_called_formatted}",
- ]
- steps.append(f"Score: {self.score}\nReason: {self.reason}")
- self.verbose_logs = construct_verbose_logs(self, steps=steps)
+ available_tools_formatted = (
+ (
+ "Available Tools:\n[\n"
+ + ",\n".join(
+ self.indent_multiline_string(
+ repr(tool_call), indent_level=4
+ )
+ for tool_call in self.available_tools
+ )
+ + "\n]"
+ )
+ if self.available_tools
+ else "Available Tools: []"
+ )
+ self.verbose_logs = construct_verbose_logs(
+ self,
+ steps=[
+ f"{expected_tools_formatted}",
+ f"{tools_called_formatted}",
+ f"{available_tools_formatted}",
+ f"Tool Selection Score: {tool_selection_score.score}",
+ f"Tool Selection Reason: {tool_selection_score.reason}",
+ f"Final Score: {self.score}\nFinal Reason: {self.reason}",
+ ],
+ )
 
  if _log_metric_to_confident:
  metric_data_manager.post_metric_if_enabled(
@@ -92,19 +250,6 @@ class ToolCorrectnessMetric(BaseMetric):
  )
  return self.score
 
- async def a_measure(
- self,
- test_case: LLMTestCase,
- _show_indicator: bool = True,
- _in_component: bool = False,
- _log_metric_to_confident: bool = True,
- ) -> float:
- return self.measure(
- test_case,
- _show_indicator=_show_indicator,
- _in_component=_in_component,
- )
-
  ##################################################
  ### Tool Correctness (Tool) ######################
  ##################################################
@@ -154,10 +299,69 @@ class ToolCorrectnessMetric(BaseMetric):
  else:
  return f"Incomplete tool usage: missing tools {list(missing)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
 
+ def _construct_final_reason(
+ self,
+ tool_calling_reason,
+ tool_selection_reason,
+ ):
+ final_reason = "[\n"
+ final_reason += "\t Tool Calling Reason: " + tool_calling_reason + "\n"
+ final_reason += (
+ "\t Tool Selection Reason: " + tool_selection_reason + "\n"
+ )
+ final_reason += "]\n"
+ return final_reason
+
  ##################################################
  ### Score Helper Functions #######################
  ##################################################
 
+ def _get_tool_selection_score(
+ self, user_input, tools_called, available_tools
+ ):
+ tools_called_formatted = print_tools_called(tools_called)
+ available_tools_formatted = print_tools_called(available_tools)
+ prompt = ToolCorrectnessTemplate.get_tool_selection_score(
+ user_input, tools_called_formatted, available_tools_formatted
+ )
+ if self.using_native_model:
+ res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
+ self.evaluation_cost += cost
+ return res
+ else:
+ try:
+ res = self.model.generate(prompt, schema=ToolSelectionScore)
+ return res
+ except TypeError:
+ res = self.model.generate(prompt)
+ data = trimAndLoadJson(res, self)
+ return ToolSelectionScore(**data)
+
+ async def _a_get_tool_selection_score(
+ self, user_input, tools_called, available_tools
+ ):
+ tools_called_formatted = print_tools_called(tools_called)
+ available_tools_formatted = print_tools_called(available_tools)
+ prompt = ToolCorrectnessTemplate.get_tool_selection_score(
+ user_input, tools_called_formatted, available_tools_formatted
+ )
+ if self.using_native_model:
+ res, cost = await self.model.a_generate(
+ prompt, schema=ToolSelectionScore
+ )
+ self.evaluation_cost += cost
+ return res
+ else:
+ try:
+ res = await self.model.a_generate(
+ prompt, schema=ToolSelectionScore
+ )
+ return res
+ except TypeError:
+ res = await self.model.a_generate(prompt)
+ data = trimAndLoadJson(res, self)
+ return ToolSelectionScore(**data)
+
  # Calculate score
  def _calculate_score(self):
  if self.should_exact_match:
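Taken together, the tool_correctness.py changes make the reported score min(tool-calling score, tool-selection score) and add an optional LLM judge. A hedged usage sketch (the test-case values and the judge model name are invented for illustration, not taken from this diff):

from deepeval.metrics import ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase, ToolCall

test_case = LLMTestCase(
    input="What's the weather in Chicago?",
    actual_output="It is sunny and 20 degrees in Chicago.",
    tools_called=[ToolCall(name="weather_lookup")],
    expected_tools=[ToolCall(name="weather_lookup")],
)

metric = ToolCorrectnessMetric(
    # New in this diff: when available_tools is provided, an LLM judge scores
    # tool selection and the final score is the minimum of both sub-scores.
    available_tools=[ToolCall(name="weather_lookup"), ToolCall(name="web_search")],
    model="gpt-4o-mini",  # assumed judge; the new signature accepts a str or DeepEvalBaseLLM
    async_mode=False,     # sync path; async_mode=True routes measure() through a_measure()
)
metric.measure(test_case)
print(metric.score, metric.reason)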
deepeval/metrics/tool_use/__init__.py
@@ -0,0 +1 @@
+ from .tool_use import ToolUseMetric