deepeval 3.6.7__py3-none-any.whl → 3.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +104 -36
- deepeval/config/utils.py +5 -0
- deepeval/dataset/dataset.py +162 -30
- deepeval/dataset/utils.py +41 -13
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +1662 -688
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/indicator.py +21 -1
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/amazon_bedrock_model.py +20 -17
- deepeval/models/llms/openai_model.py +10 -1
- deepeval/models/retry_policy.py +103 -20
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/simulator/conversation_simulator.py +25 -18
- deepeval/synthesizer/chunking/context_generator.py +9 -1
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/RECORD +85 -63
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.9.dist-info}/entry_points.txt +0 -0
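The headline of this release, per the file list above, is a batch of new agent-evaluation metrics (goal accuracy, plan adherence, plan quality, step efficiency, tool use, topic adherence) plus a reworked evaluate/execute path and an extended ToolCorrectnessMetric. A minimal import sketch against the new module paths follows; the paths come from the file list, and ToolUseMetric is the only new class name this diff confirms directly (see the tool_use/__init__.py hunk at the end), so treat anything beyond these imports as an assumption.

# Sketch only: module paths are taken from the file list above; ToolUseMetric is
# confirmed by the final hunk of this diff, the template classes by the hunks below.
from deepeval.metrics.tool_use import ToolUseMetric
from deepeval.metrics.step_efficiency.template import StepEfficiencyTemplate
from deepeval.metrics.tool_correctness.template import ToolCorrectnessTemplate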
deepeval/metrics/step_efficiency/template.py (new file)

@@ -0,0 +1,256 @@
+import textwrap
+import json
+from deepeval.tracing.utils import make_json_serializable
+
+
+class StepEfficiencyTemplate:
+
+    @staticmethod
+    def extract_task_from_trace(trace: dict) -> str:
+        return textwrap.dedent(
+            f"""You are a **trace analyst** tasked with extracting the **user's original goal or task** from a complete nested execution trace of an AI agent.
+
+YOUR OBJECTIVE:
+
+Identify and describe **exactly what the user asked the agent to do**, based only on the user's explicit input and any unambiguous contextual details present in the trace.
+
+Your goal is to produce a **concise, fact-based statement** that captures the *intended user task* — not the agent's plan, actions, reasoning, or assumptions.
+
+STRICT EXTRACTION RULES:
+
+1. Primary Source: Root-Level User Input
+   - The user's task must be derived **directly and primarily** from the root agent's `"input"` field.
+   - If that field contains nested `"input"` or `"messages"`, extract the true user instruction or request text from within it.
+
+2. Secondary Context: Subtasks as Clarifiers (Optional)
+   - You may use child spans (tools, retrievers, LLMs) **only** to clarify or disambiguate what the user explicitly asked for —
+     e.g., to confirm that the task involves multiple subtasks the user clearly implied (like booking and planning steps for a trip).
+   - You may **NOT** infer new goals that the user did not state or imply.
+
+3. No Hallucination
+   - Do **NOT** invent goals, assumptions, or implied needs beyond what is explicitly or clearly inferable from the input.
+   - If the user's request is vague, preserve that vagueness — do not expand it.
+
+4. Agent-Agnostic Rule
+   - Ignore the agent's tools, methods, reasoning, or internal operations.
+   - The task reflects **what the user wanted**, not how the agent chose to approach it.
+
+5. Perspective
+   - Express the extracted task **from the user's perspective**, as if restating what they asked the system to do.
+   - Avoid any meta or evaluative phrasing (“The user wanted the agent to…”).
+
+6. Fallback Condition
+   - If the only available information about the task is the raw user input text, return that input verbatim without modification.
+
+OUTPUT FORMAT:
+
+Return **only** a JSON object of this form:
+
+{{
+    "task": "<a single clear sentence summarizing the user's explicit goal>"
+}}
+
+- The `"task"` value should be a single, coherent natural language sentence or two at most.
+- Do not include commentary, metadata, or any additional fields.
+
+EXAMPLES:
+
+Example Trace: {{
+    "name": "trip_planner",
+    "type": "agent",
+    "input": {{
+        "input": "Help me plan a business trip to Chicago next week."
+    }},
+    "children": [
+        {{
+            "name": "flight_tool",
+            "type": "tool",
+            "input": {{
+                "inputParameters": {{
+                    "destination": "Chicago",
+                    "date": "2024-07-10"
+            }} }},
+            "output": {{
+                "flights": ["Flight 101", "Flight 202"]
+            }},
+            "children": []
+        }},
+        {{
+            "name": "hotel_tool",
+            "type": "tool",
+            "input": {{
+                "inputParameters": {{
+                    "location": "Chicago",
+                    "check_in": "2024-07-10",
+                    "check_out": "2024-07-12"
+            }} }},
+            "output": {{
+                "hotels": ["The Grand Chicago", "Lakeview Inn"]
+            }},
+            "children": []
+        }},
+        {{
+            "name": "agenda_llm",
+            "type": "llm",
+            "input": {{
+                "prompt": "Draft a meeting agenda",
+                "input": [
+                    {{
+                        "role": "system",
+                        "content": "You are an executive assistant."
+                    }},
+                    {{
+                        "role": "user",
+                        "content": "Create an agenda for a client strategy meeting."
+                    }}
+                ]
+            }},
+            "output": "1. Q2 review\\n2. Client feedback\\n3. Strategy planning",
+            "children": []
+        }}
+    ]
+}}
+
+Expected JSON:
+{{
+    "task": "Plan a business trip to Chicago next week, including booking a flight, reserving a hotel, and drafting a client meeting agenda."
+}}
+
+IMPORTANT ENFORCEMENT RULES:
+
+- If multiple user inputs exist, identify the overall task that user has in mind.
+- Do not include execution details, tools, function names, or reasoning text.
+- Avoid restating or paraphrasing beyond clarity; preserve the user's intent exactly.
+- When uncertain, extract **less rather than more** — prefer minimal, factual phrasing over speculative completion.
+
+TRACE DATA:
+
+{json.dumps(trace, default=make_json_serializable, indent=2)}
+
+---
+
+### JSON:
+"""
+        )
+
+    @staticmethod
+    def get_execution_efficiency(task: str, trace: dict) -> str:
+        return textwrap.dedent(
+            f"""You are an **efficiency auditor** evaluating how economically an AI agent executed a task.
+
+OBJECTIVE:
+
+Determine how **efficiently** the agent executed the given task based on its full execution trace.
+Efficiency means achieving the user's goal using the **fewest, simplest, and most direct** actions possible.
+
+You must assign a score from **0.0 to 1.0** that reflects how close the execution came to the *minimal necessary sequence of actions*.
+
+**Important:** You are not evaluating correctness, completeness, creativity, or helpfulness — only the *efficiency* of the execution.
+
+STRICT EVALUATION RULES:
+
+1. Zero-Tolerance for Unnecessary Actions
+   - Every step, tool call, LLM query, or retrieval must be **strictly required** to fulfill the task.
+   - If a single tool, retrieval, or reasoning step is superfluous, speculative, repetitive, or stylistic,
+     the score must be as low as possible, regardless of outcome quality.
+   - Adding “helpful” or “contextual” actions that were not explicitly necessary is an inefficiency.
+
+2. Minimal Action Principle
+   - The ideal execution performs the **exact minimum number of steps** needed to complete the task.
+   - Each step must directly contribute to completing the task, not to exploration, confirmation, or elaboration.
+
+3. No Speculation or Enrichment
+   - Any activity aimed at *enhancing*, *expanding*, or *beautifying* the answer (e.g., extra retrievals, style edits, rephrasings)
+     reduces the score sharply (≤ 0.25).
+   - Efficiency is about restraint — **doing exactly what's required, nothing more**.
+
+4. Directness and Focus
+   - Steps must appear in a logically minimal sequence from input to goal.
+   - Repetition, re-querying, nested reasoning loops, or tool reuse when not needed
+     indicate inefficiency.
+
+5. Resource Economy
+   - Use of multiple LLM calls, retrievers, or tools when one would suffice must be penalized.
+   - Avoided resources (if the agent achieved the task through simpler means) improve efficiency.
+
+6. When in Doubt
+   - If it is unclear whether an action was required or not, **assume it was unnecessary** and lower the score.
+   - Err on the side of penalizing over generosity.
+
+SCORING SCALE (STRICT)
+
+- **1.0 — Perfectly efficient**
+  - Only essential steps taken.
+  - Each action was directly necessary for task completion.
+  - No speculative, redundant, or decorative work.
+
+- **0.75 — Strong efficiency**
+  - Mostly minimal execution with one small redundant or stylistic step.
+  - Slight overuse of a tool or repeated call, but otherwise tight.
+
+- **0.5 — Moderate efficiency**
+  - Noticeable inefficiency: extra steps, unnecessary tool calls, or indirect methods.
+  - The same task could clearly have been completed faster or with fewer actions.
+
+- **0.25 — Low efficiency**
+  - Multiple irrelevant or unjustified actions taken.
+  - Execution path significantly longer or more complex than needed.
+
+- **0.0 — Highly inefficient**
+  - Execution was verbose, exploratory, speculative, or wasteful.
+  - Most actions were unnecessary or unrelated to achieving the core task.
+
+*When uncertain, always assign the lower score.*
+
+OUTPUT FORMAT:
+
+Return a single JSON object in this exact format:
+
+{{
+    "score": 0.0,
+    "reason": "1-3 concise factual sentences describing where inefficiencies occurred."
+}}
+
+The `reason` must:
+- Identify specific inefficient actions (e.g., redundant LLM call, unnecessary retrieval, speculative tool use).
+- Avoid subjective phrasing (“reasonable”, “seems okay”, “somewhat efficient”).
+- Be direct and concrete: “Extra retrieval used for enrichment”, “Multiple summarizations of same data”, etc.
+
+EXAMPLES
+
+**Example 1:**
+Task: "Summarize the given text."
+Trace: Agent calls an LLM twice, then performs an extra web search.
+
+→ Output:
+{{
+    "score": 0.25,
+    "reason": "The agent used redundant LLM calls and performed an unnecessary web search. Only one LLM call was required for the summary."
+}}
+
+**Example 2:**
+Task: "Convert a date to ISO format."
+Trace: Agent performs one computation directly.
+
+→ Output:
+{{
+    "score": 1.0,
+    "reason": "The agent completed the task with one minimal action and no unnecessary steps."
+}}
+
+FINAL REMINDERS
+
+- Efficiency = minimality. Any extra work, enrichment, or indirect approach must lower the score.
+- Do not consider correctness, helpfulness, or reasoning quality.
+- A “good answer” can still score **0.0** if it was achieved inefficiently.
+- This metric is adversarial: assign the lowest score possible unless execution was provably minimal.
+
+TASK:
+{task}
+
+TRACE:
+{json.dumps(trace, indent=2, default=str)}
+
+JSON:
+"""
+        )
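The two builders above only format prompts: they take a plain trace dict plus a task string and return the judge prompt as a string. A minimal sketch of driving them directly (module path taken from the file list; the trace layout below mirrors the example embedded in the prompt and is otherwise illustrative):

# Sketch only: assumes deepeval 3.6.9 is installed; the trace contents are illustrative.
from deepeval.metrics.step_efficiency.template import StepEfficiencyTemplate

trace = {
    "name": "trip_planner",
    "type": "agent",
    "input": {"input": "Help me plan a business trip to Chicago next week."},
    "children": [],
}

# Renders the task-extraction prompt; a judge LLM would answer it with {"task": ...}.
task_prompt = StepEfficiencyTemplate.extract_task_from_trace(trace)

# Renders the efficiency-scoring prompt for a previously extracted task statement;
# the judge answers with {"score": ..., "reason": ...}.
score_prompt = StepEfficiencyTemplate.get_execution_efficiency(
    task="Plan a business trip to Chicago next week.", trace=trace
)

print(len(task_prompt), len(score_prompt))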
deepeval/metrics/tool_correctness/template.py (new file)

@@ -0,0 +1,88 @@
+import textwrap
+import json
+
+
+class ToolCorrectnessTemplate:
+
+    @staticmethod
+    def get_tool_selection_score(
+        user_input: str, tools_called: list, available_tools: list
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **Tool Selection** quality of an AI agent.
+
+You are given:
+- The **user input** that defines the user's goal / task.
+- A list of **available tools**, each with a name and description.
+- A list of **tool calls made** by the agent during execution, including tool name and parameters.
+
+Your job is to assign a **Tool Selection score** from 0.0 to 1.0 based on how appropriate and well-matched the agent's chosen tools were to the task's requirements.
+
+---
+
+DEFINITION:
+
+Tool Selection evaluates how suitable the agent's tool choices were in addressing the task and sub-tasks.
+
+This metric does **not** consider:
+- How well the tools were used (execution quality)
+- Whether the agent adhered to a plan
+- Whether the output was correct or efficient
+
+It only assesses whether the **right tools** were selected, based on their stated descriptions and the demands of the task.
+
+---
+
+INSTRUCTIONS:
+
+Step 1: Read the **user task** to understand what needed to be accomplished.
+
+Step 2: Examine the **available tools** and their descriptions to understand the intended purpose of each.
+
+Step 3: Review the **tool calls made by the agent**:
+- Were the selected tools well-aligned with the task?
+- Were any obviously better-suited tools ignored?
+- Were any tools misapplied or used unnecessarily?
+
+Step 4: Identify selection issues:
+- **Correct Selection**: Tool(s) chosen directly and appropriately matched the subtask.
+- **Over-selection**: More tools were selected than necessary, despite availability of a simpler or more direct option.
+- **Under-selection**: Key tools that were well-suited were omitted.
+- **Mis-selection**: Tools were chosen that were poorly matched to their purpose or the subtask.
+
+---
+
+SCORING GUIDE:
+
+- **1.0** → All selected tools were appropriate and necessary. No better-suited tools were omitted.
+- **0.75** → Tool choices were mostly appropriate, with minor omissions or unnecessary use.
+- **0.5** → Mixed tool selection. Some useful tools ignored or some inappropriate ones used.
+- **0.25** → Poor tool selection. Better alternatives were available and ignored.
+- **0.0** → Tool selection was clearly misaligned with task requirements.
+
+---
+
+OUTPUT FORMAT:
+
+Return a valid JSON object with this exact structure:
+{{
+    "score": float between 0.0 and 1.0,
+    "reason": "1-3 concise, factual sentences explaining the score. Reference specific tool names and descriptions when relevant."
+}}
+
+Do not include any additional commentary or output outside the JSON object.
+
+---
+
+USER INPUT:
+{user_input}
+
+ALL AVAILABLE TOOLS:
+{available_tools}
+
+TOOL CALLS MADE BY AGENT:
+{tools_called}
+
+JSON:
+"""
+        )
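As with the template above, get_tool_selection_score only renders a prompt. In the metric itself (next hunks) the tool lists are pre-formatted with print_tools_called before being interpolated; the sketch below passes plain strings, which the f-string accepts just as well. The parameter names come from the signature above; the tool descriptions are illustrative placeholders.

# Sketch only: renders the tool-selection judge prompt; tool names/descriptions are made up.
from deepeval.metrics.tool_correctness.template import ToolCorrectnessTemplate

prompt = ToolCorrectnessTemplate.get_tool_selection_score(
    user_input="Book a flight to Chicago and reserve a hotel.",
    tools_called="[flight_tool(destination='Chicago'), hotel_tool(location='Chicago')]",
    available_tools="[flight_tool: books flights, hotel_tool: books hotels, weather_tool: forecasts]",
)
print(prompt)  # feed this to a judge LLM and parse the {"score", "reason"} JSON it returns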
deepeval/metrics/tool_correctness/tool_correctness.py

@@ -1,10 +1,15 @@
-from typing import List, Dict
+from typing import List, Dict, Optional, Union

 from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.utils import get_or_create_event_loop, prettify_list
 from deepeval.metrics.utils import (
     construct_verbose_logs,
     check_llm_test_case_params,
+    trimAndLoadJson,
+    initialize_model,
+    print_tools_called,
 )
+from deepeval.models import DeepEvalBaseLLM
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -13,6 +18,8 @@ from deepeval.test_case import (
 )
 from deepeval.metrics import BaseMetric
 from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics.tool_correctness.template import ToolCorrectnessTemplate
+from deepeval.metrics.tool_correctness.schema import ToolSelectionScore


 class ToolCorrectnessMetric(BaseMetric):
@@ -25,15 +32,21 @@ class ToolCorrectnessMetric(BaseMetric):

     def __init__(
         self,
+        available_tools: List[ToolCall] = None,
         threshold: float = 0.5,
         evaluation_params: List[ToolCallParams] = [],
+        model: Optional[Union[str, DeepEvalBaseLLM]] = None,
         include_reason: bool = True,
+        async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
         should_exact_match: bool = False,
         should_consider_ordering: bool = False,
     ):
+        self.available_tools = available_tools
         self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = initialize_model(model)
+        self.async_mode = async_mode
         self.include_reason = include_reason
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
@@ -51,14 +64,140 @@ class ToolCorrectnessMetric(BaseMetric):

         check_llm_test_case_params(test_case, self._required_params, self)
         self.test_case = test_case
+        self.evaluation_cost = 0 if self.using_native_model else None
+
         with metric_progress_indicator(
             self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_measure(
+                        test_case,
+                        _show_indicator=False,
+                        _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
+                    )
+                )
+            else:
+                self.tools_called: List[ToolCall] = test_case.tools_called
+                self.expected_tools: List[ToolCall] = test_case.expected_tools
+                tool_calling_score = self._calculate_score()
+                if self.available_tools:
+                    tool_selection_score = self._get_tool_selection_score(
+                        test_case.input,
+                        test_case.tools_called,
+                        self.available_tools,
+                    )
+                else:
+                    tool_selection_score = tool_selection_score = (
+                        ToolSelectionScore(
+                            score=1,
+                            reason="No available tools were provided to assess tool selection criteria",
+                        )
+                    )
+                score = min(tool_calling_score, tool_selection_score.score)
+                self.score = (
+                    0 if self.strict_mode and score < self.threshold else score
+                )
+                tool_calling_reason = self._generate_reason()
+                self.reason = self._construct_final_reason(
+                    tool_calling_reason, tool_selection_score.reason
+                )
+                self.success = self.score >= self.threshold
+
+                expected_tools_formatted = (
+                    "Expected Tools:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.expected_tools
+                    )
+                    + "\n]"
+                )
+                tools_called_formatted = (
+                    "Tools Called:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.tools_called
+                    )
+                    + "\n]"
+                )
+                available_tools_formatted = (
+                    (
+                        "Available Tools:\n[\n"
+                        + ",\n".join(
+                            self.indent_multiline_string(
+                                repr(tool_call), indent_level=4
+                            )
+                            for tool_call in self.available_tools
+                        )
+                        + "\n]"
+                    )
+                    if self.available_tools
+                    else "Available Tools: []"
+                )
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"{expected_tools_formatted}",
+                        f"{tools_called_formatted}",
+                        f"{available_tools_formatted}",
+                        f"Tool Selection Score: {tool_selection_score.score}",
+                        f"Tool Selection Reason: {tool_selection_score.reason}",
+                        f"Final Score: {self.score}\nFinal Reason: {self.reason}",
+                    ],
+                )
+
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, self._required_params, self)
+
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
         ):
             self.tools_called: List[ToolCall] = test_case.tools_called
             self.expected_tools: List[ToolCall] = test_case.expected_tools
-
-
+            tool_calling_score = self._calculate_score()
+            if self.available_tools:
+                tool_selection_score = await self._a_get_tool_selection_score(
+                    test_case.input,
+                    test_case.tools_called,
+                    self.available_tools,
+                )
+            else:
+                tool_selection_score = ToolSelectionScore(
+                    score=1,
+                    reason="No available tools were provided to assess tool selection criteria",
+                )
+            score = min(tool_calling_score, tool_selection_score.score)
+            self.score = (
+                0 if self.strict_mode and score < self.threshold else score
+            )
+            tool_calling_reason = self._generate_reason()
+            self.reason = self._construct_final_reason(
+                tool_calling_reason, tool_selection_score.reason
+            )
             self.success = self.score >= self.threshold
+
             expected_tools_formatted = (
                 "Expected Tools:\n[\n"
                 + ",\n".join(
@@ -79,12 +218,31 @@ class ToolCorrectnessMetric(BaseMetric):
                 )
                 + "\n]"
             )
-
-
-
-
-
-
+            available_tools_formatted = (
+                (
+                    "Available Tools:\n[\n"
+                    + ",\n".join(
+                        self.indent_multiline_string(
+                            repr(tool_call), indent_level=4
+                        )
+                        for tool_call in self.available_tools
+                    )
+                    + "\n]"
+                )
+                if self.available_tools
+                else "Available Tools: []"
+            )
+            self.verbose_logs = construct_verbose_logs(
+                self,
+                steps=[
+                    f"{expected_tools_formatted}",
+                    f"{tools_called_formatted}",
+                    f"{available_tools_formatted}",
+                    f"Tool Selection Score: {tool_selection_score.score}",
+                    f"Tool Selection Reason: {tool_selection_score.reason}",
+                    f"Final Score: {self.score}\nFinal Reason: {self.reason}",
+                ],
+            )

             if _log_metric_to_confident:
                 metric_data_manager.post_metric_if_enabled(
@@ -92,19 +250,6 @@ class ToolCorrectnessMetric(BaseMetric):
                 )
         return self.score

-    async def a_measure(
-        self,
-        test_case: LLMTestCase,
-        _show_indicator: bool = True,
-        _in_component: bool = False,
-        _log_metric_to_confident: bool = True,
-    ) -> float:
-        return self.measure(
-            test_case,
-            _show_indicator=_show_indicator,
-            _in_component=_in_component,
-        )
-
     ##################################################
     ### Tool Correctness (Tool) ######################
     ##################################################
@@ -154,10 +299,69 @@ class ToolCorrectnessMetric(BaseMetric):
         else:
             return f"Incomplete tool usage: missing tools {list(missing)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."

+    def _construct_final_reason(
+        self,
+        tool_calling_reason,
+        tool_selection_reason,
+    ):
+        final_reason = "[\n"
+        final_reason += "\t Tool Calling Reason: " + tool_calling_reason + "\n"
+        final_reason += (
+            "\t Tool Selection Reason: " + tool_selection_reason + "\n"
+        )
+        final_reason += "]\n"
+        return final_reason
+
     ##################################################
     ### Score Helper Functions #######################
     ##################################################

+    def _get_tool_selection_score(
+        self, user_input, tools_called, available_tools
+    ):
+        tools_called_formatted = print_tools_called(tools_called)
+        available_tools_formatted = print_tools_called(available_tools)
+        prompt = ToolCorrectnessTemplate.get_tool_selection_score(
+            user_input, tools_called_formatted, available_tools_formatted
+        )
+        if self.using_native_model:
+            res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res = self.model.generate(prompt, schema=ToolSelectionScore)
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
+    async def _a_get_tool_selection_score(
+        self, user_input, tools_called, available_tools
+    ):
+        tools_called_formatted = print_tools_called(tools_called)
+        available_tools_formatted = print_tools_called(available_tools)
+        prompt = ToolCorrectnessTemplate.get_tool_selection_score(
+            user_input, tools_called_formatted, available_tools_formatted
+        )
+        if self.using_native_model:
+            res, cost = await self.model.a_generate(
+                prompt, schema=ToolSelectionScore
+            )
+            self.evaluation_cost += cost
+            return res
+        else:
+            try:
+                res = await self.model.a_generate(
+                    prompt, schema=ToolSelectionScore
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = trimAndLoadJson(res, self)
+                return ToolSelectionScore(**data)
+
     # Calculate score
     def _calculate_score(self):
         if self.should_exact_match:
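Taken together, these hunks mean ToolCorrectnessMetric can now also judge tool selection with an LLM when available_tools is supplied, and that measure() delegates to the new a_measure() whenever async_mode is left on; the final score is the minimum of the exact-match tool-calling score and the LLM tool-selection score. A hedged usage sketch follows, using only the constructor parameters visible in the __init__ hunk above; the ToolCall and LLMTestCase fields are the usual deepeval ones, and the tool names and judge model are illustrative.

# Sketch only: assumes deepeval 3.6.9 and a configured judge model (e.g. an OpenAI key);
# tool names are illustrative, not taken from the diff.
from deepeval.test_case import LLMTestCase, ToolCall
from deepeval.metrics import ToolCorrectnessMetric

test_case = LLMTestCase(
    input="Book a flight to Chicago and reserve a hotel.",
    actual_output="Booked Flight 101 and reserved The Grand Chicago.",
    tools_called=[ToolCall(name="flight_tool"), ToolCall(name="hotel_tool")],
    expected_tools=[ToolCall(name="flight_tool"), ToolCall(name="hotel_tool")],
)

metric = ToolCorrectnessMetric(
    # new in 3.6.9: the candidate tool set the agent could have picked from
    available_tools=[
        ToolCall(name="flight_tool"),
        ToolCall(name="hotel_tool"),
        ToolCall(name="weather_tool"),
    ],
    model="gpt-4o",    # new: judge model used for the tool-selection score
    async_mode=True,   # new: measure() now runs a_measure() on an event loop
    threshold=0.5,
)

metric.measure(test_case)
print(metric.score, metric.reason)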
deepeval/metrics/tool_use/__init__.py (new file)

@@ -0,0 +1 @@
+from .tool_use import ToolUseMetric