deepeval-3.6.7-py3-none-any.whl → deepeval-3.6.8-py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- deepeval/_version.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/execute.py +725 -217
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +13 -3
- deepeval/integrations/crewai/__init__.py +2 -1
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/contextual_precision/contextual_precision.py +27 -21
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +11 -7
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +8 -8
- deepeval/metrics/mcp/mcp_task_completion.py +7 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +16 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +2 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +32 -24
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/task_completion/task_completion.py +1 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +226 -22
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/openai/extractors.py +61 -16
- deepeval/openai/patch.py +8 -12
- deepeval/openai/types.py +1 -1
- deepeval/openai/utils.py +108 -1
- deepeval/prompt/prompt.py +1 -0
- deepeval/prompt/utils.py +43 -14
- deepeval/synthesizer/synthesizer.py +11 -10
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/test_run.py +190 -207
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +23 -4
- deepeval/tracing/trace_context.py +53 -38
- deepeval/tracing/tracing.py +23 -0
- deepeval/tracing/types.py +16 -14
- deepeval/utils.py +21 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/RECORD +75 -53
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- deepeval/tracing/message_types/__init__.py +0 -10
- deepeval/tracing/message_types/base.py +0 -6
- deepeval/tracing/message_types/messages.py +0 -14
- deepeval/tracing/message_types/tools.py +0 -18
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.7.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
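The 3.6.8 file list above introduces several new conversational metrics (goal accuracy, plan adherence, plan quality, step efficiency, tool use, topic adherence). A minimal import sketch follows for the two classes whose definitions appear in the hunks below; the submodule paths come from the file list, while the idea that each class is also re-exported from deepeval.metrics rests only on the +13 lines added to deepeval/metrics/__init__.py and is an assumption.

# Hedged sketch: importing two of the new metrics via their submodules.
# ToolUseMetric is defined in tool_use/tool_use.py and TopicAdherenceMetric is
# re-exported by topic_adherence/__init__.py, both shown in the hunks below.
from deepeval.metrics.tool_use.tool_use import ToolUseMetric
from deepeval.metrics.topic_adherence import TopicAdherenceMetric

# Assumed equivalent, if the top-level re-exports in metrics/__init__.py cover them:
# from deepeval.metrics import ToolUseMetric, TopicAdherenceMetric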
deepeval/metrics/tool_use/schema.py (new file)
@@ -0,0 +1,19 @@
+from pydantic import BaseModel
+
+
+class UserInputAndTools(BaseModel):
+    user_messages: str
+    assistant_messages: str
+    tools_called: str
+    available_tools: str
+    tools_used: bool
+
+
+class ToolSelectionScore(BaseModel):
+    score: float
+    reason: str
+
+
+class ArgumentCorrectnessScore(BaseModel):
+    score: float
+    reason: str
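The tool_use/schema.py hunk above defines the three pydantic models the new Tool Use metric uses to structure judge output: the per-interaction input bundle and the two score/reason payloads. Below is a small validation sketch; the literal JSON string is invented for illustration, and the v1 fallback noted in the comment is an assumption about which pydantic major version is installed.

from deepeval.metrics.tool_use.schema import ToolSelectionScore

# Illustrative judge reply; the metric itself obtains this via model.generate(...)
# with schema=ToolSelectionScore (see the tool_use.py hunk further down).
raw = '{"score": 0.75, "reason": "flight_search matched the task; one call was redundant."}'
parsed = ToolSelectionScore.model_validate_json(raw)  # pydantic v2; use parse_raw on v1
assert 0.0 <= parsed.score <= 1.0
print(parsed.reason)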
deepeval/metrics/tool_use/template.py (new file)
@@ -0,0 +1,220 @@
+import textwrap
+import json
+
+
+class ToolUseTemplate:
+
+    @staticmethod
+    def get_tool_selection_score(
+        user_input: str,
+        assistant_messages: str,
+        tools_called: str,
+        available_tools: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **Tool Selection Quality** of an AI agent.
+
+            OBJECTIVE
+            Evaluate whether the agent **selected the most appropriate tools** for completing the user's task, given a list of available tools.
+
+            This metric focuses **only** on which tools were chosen — **not** how they were used or whether they succeeded.
+
+            EVALUATION RULES
+
+            1. Relevance
+            - Each tool used must directly support the user's stated goal or a clear sub-task derived from it.
+            - Tools unrelated to the goal lower the score sharply.
+
+            2. Appropriateness
+            - The chosen tools must match their described purpose.
+            - If a more suitable tool existed and was ignored, score ≤ 0.5.
+
+            3. Necessity
+            - Every tool call must be justified by clear need.
+            - Redundant or speculative tool use (e.g., calling multiple tools that overlap) reduces the score.
+
+            4. Strictness
+            - When uncertain if a tool was required or correctly chosen, assume it was **not** appropriate.
+            - Only perfect alignment between the task and tool choice earns a high score.
+
+            SCORING GUIDE:
+
+            - **1.0** → Every tool used was necessary and perfectly matched to the task; no better alternative ignored.
+            - **0.75** → Tool selection was mostly correct, with only minor redundancy or a small omission.
+            - **0.5** → Mixed quality; some appropriate selections, but others questionable or missing.
+            - **0.25** → Poor selection; major mismatches or misuse of available tools.
+            - **0.0** → Tool selection irrelevant, random, or unjustified.
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with:
+
+            {{
+            "score": float between 0.0 and 1.0,
+            "reason": "1-3 factual sentences explaining which tools were appropriate or inappropriate for the task, referencing specific tool names."
+            }}
+
+            USER INPUT:
+            {user_input}
+
+            ASSISTANT MESSAGES:
+            {assistant_messages}
+
+            TOOLS CALLED:
+            {tools_called}
+
+            AVAILABLE TOOLS:
+            {available_tools}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_argument_correctness_score(
+        user_input: str,
+        assistant_messages: str,
+        tools_called: str,
+        available_tools: str,
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator assessing the **Tool Argument Quality** of an AI agent.
+
+            OBJECTIVE:
+
+            Evaluate whether the **arguments and parameters** passed to each tool were:
+            - Correctly structured and complete.
+            - Contextually appropriate for the user's goal.
+            - Compatible with each tool's intended purpose.
+
+            This metric focuses **only** on argument-level correctness and relevance — not which tools were chosen.
+
+            EVALUATION RULES
+
+            1. Relevance
+            - Each argument must align with the task and the tool's documented input fields.
+            - Unrelated, empty, or default arguments reduce the score sharply.
+
+            2. **Completeness**
+            - All required parameters must be provided.
+            - Missing or malformed arguments (e.g., wrong data types or incomplete context) lower the score.
+
+            3. **Specificity**
+            - Arguments should reflect task-specific values, not generic placeholders.
+            - Overly vague or default arguments are penalized.
+
+            4. **Justification**
+            - Each argument must make sense in context.
+            - If it doesn't clearly derive from the user's request, assume it's incorrect.
+
+            5. **Strict Bias**
+            - When uncertain whether arguments fit the tool or task, assume they were **incorrect**.
+
+            SCORING GUIDE:
+
+            - **1.0** → All arguments are accurate, specific, and fully aligned with both the task and tool requirements.
+            - **0.75** → Mostly correct; minor omissions or small mismatches.
+            - **0.5** → Partial correctness; some valid parameters, but key ones missing or off-target.
+            - **0.25** → Poor argument quality; several invalid or irrelevant fields.
+            - **0.0** → Arguments nonsensical, generic, or unrelated to task/tool intent.
+
+            OUTPUT FORMAT:
+
+            Return a JSON object with:
+            {{
+            "score": float between 0.0 and 1.0,
+            "reason": "1-3 sentences explaining argument alignment or issues, referencing specific parameter names or values when possible."
+            }}
+
+            ---
+
+            USER INPUT:
+            {user_input}
+
+            ASSISTANT MESSAGES:
+            {assistant_messages}
+
+            TOOLS CALLED (with arguments):
+            {tools_called}
+
+            AVAILABLE TOOLS:
+            {available_tools}
+
+            JSON:
+            """
+        )
+
+    @staticmethod
+    def get_tool_selection_final_reason(
+        all_scores_and_reasons: str, final_score: float, threshold: float
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator summarizing the outcome of a **Tool Selection** evaluation.
+
+            You are given:
+            - A list of **tool selection sub-scores and reasons**, each describing how appropriately the agent chose tools for its task.
+            - The **final aggregated score** across all sub-evaluations.
+            - A **threshold** representing the minimum passing score.
+
+            Your task is to write a **single concise explanation (1-3 sentences)** that captures:
+            - Why the agent **passed or failed** based on tool choice quality.
+            - The key patterns or trends in the sub-reasons (e.g., consistent correct choices, repeated irrelevant tool calls, missed best-fit tools).
+            - A clear statement linking the **score** and **threshold** outcome (e.g., “The agent passed because…” or “Failed because…”).
+
+            RULES:
+            - Focus on *which tools were selected* and *why that selection pattern was or wasn't appropriate*.
+            - Mention specific issues or strengths like redundancy, misuse, or perfect matching.
+            - Avoid vague or subjective language such as “pretty good” or “reasonable”.
+            - Do **not** reference argument-level details; this summary is only for tool choice quality.
+            - The result must read as a self-contained, factual justification.
+
+            FORMAT:
+            Return only a single plain-text string. Do **not** include JSON or other formatting.
+
+            All Tool Selection Sub-Scores and Reasons:
+            {all_scores_and_reasons}
+
+            Final Score: {final_score}
+            Threshold: {threshold}
+            Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+            Final Reason:
+            """
+        )
+
+    @staticmethod
+    def get_tool_argument_final_reason(
+        all_scores_and_reasons: str, final_score: float, threshold: float
+    ) -> str:
+        return textwrap.dedent(
+            f"""You are an expert evaluator summarizing the outcome of a **Tool Argument Quality** evaluation.
+
+            You are given:
+            - A list of **argument-level sub-scores and reasons**, each evaluating whether the arguments passed to tools were accurate, complete, and contextually appropriate.
+            - The **final aggregated score** across all argument evaluations.
+            - A **threshold** representing the minimum passing score.
+
+            Your task is to write a **single concise explanation (1-3 sentences)** that clearly states:
+            - Why the agent **passed or failed** in its use of tool arguments.
+            - The dominant strengths or weaknesses from the sub-reasons (e.g., correct parameterization, missing required fields, generic values, or misaligned arguments).
+            - Whether the agent met or fell short of the threshold and why.
+
+            RULES:
+            - Focus strictly on **argument correctness** and **context alignment** — not which tools were chosen.
+            - Reference specific argument-level problems or successes where helpful.
+            - Keep language objective and factual; avoid speculation or vague phrasing.
+            - The summary must stand alone as a clear explanation of the final result.
+
+            FORMAT:
+            Return only a single plain-text string. Do **not** include JSON or any extra formatting.
+
+            All Tool Argument Sub-Scores and Reasons:
+            {all_scores_and_reasons}
+
+            Final Score: {final_score}
+            Threshold: {threshold}
+            Result: {"PASS" if final_score >= threshold else "FAIL"}
+
+            Final Reason:
+            """
+        )
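The tool_use/template.py hunk above is a set of static prompt builders for the judge model. The sketch below renders the tool-selection prompt; the method name and its four string parameters come straight from the hunk, while all four argument values are invented placeholders (the metric itself fills them from repr() of ToolCall objects, as the next hunk shows).

from deepeval.metrics.tool_use.template import ToolUseTemplate

prompt = ToolUseTemplate.get_tool_selection_score(
    user_input="Book me a flight to Tokyo next Friday.",               # placeholder
    assistant_messages="I searched flights and booked the 9am one.",   # placeholder
    tools_called='ToolCall(name="flight_search"),ToolCall(name="flight_booking")',
    available_tools='ToolCall(name="flight_search"),ToolCall(name="flight_booking"),ToolCall(name="hotel_search")',
)
print(prompt)  # ends with "JSON:", prompting the judge model to reply with the JSON object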
@@ -0,0 +1,458 @@
|
|
|
1
|
+
from typing import Optional, List, Union
|
|
2
|
+
import asyncio
|
|
3
|
+
from deepeval.utils import get_or_create_event_loop, prettify_list
|
|
4
|
+
from deepeval.metrics.utils import (
|
|
5
|
+
construct_verbose_logs,
|
|
6
|
+
trimAndLoadJson,
|
|
7
|
+
get_unit_interactions,
|
|
8
|
+
print_tools_called,
|
|
9
|
+
check_conversational_test_case_params,
|
|
10
|
+
initialize_model,
|
|
11
|
+
)
|
|
12
|
+
from deepeval.test_case import (
|
|
13
|
+
ConversationalTestCase,
|
|
14
|
+
TurnParams,
|
|
15
|
+
ToolCall,
|
|
16
|
+
Turn,
|
|
17
|
+
)
|
|
18
|
+
from deepeval.metrics import BaseConversationalMetric
|
|
19
|
+
from deepeval.models import DeepEvalBaseLLM
|
|
20
|
+
from deepeval.metrics.indicator import metric_progress_indicator
|
|
21
|
+
from deepeval.metrics.tool_use.template import ToolUseTemplate
|
|
22
|
+
from deepeval.metrics.tool_use.schema import (
|
|
23
|
+
ToolSelectionScore,
|
|
24
|
+
UserInputAndTools,
|
|
25
|
+
ArgumentCorrectnessScore,
|
|
26
|
+
)
|
|
27
|
+
from deepeval.metrics.api import metric_data_manager
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ToolUseMetric(BaseConversationalMetric):
|
|
31
|
+
|
|
32
|
+
_required_test_case_params = [
|
|
33
|
+
TurnParams.ROLE,
|
|
34
|
+
TurnParams.CONTENT,
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
def __init__(
|
|
38
|
+
self,
|
|
39
|
+
available_tools: List[ToolCall],
|
|
40
|
+
threshold: float = 0.5,
|
|
41
|
+
model: Optional[Union[str, DeepEvalBaseLLM]] = None,
|
|
42
|
+
include_reason: bool = True,
|
|
43
|
+
async_mode: bool = True,
|
|
44
|
+
strict_mode: bool = False,
|
|
45
|
+
verbose_mode: bool = False,
|
|
46
|
+
):
|
|
47
|
+
self.available_tools = available_tools
|
|
48
|
+
self.threshold = 1 if strict_mode else threshold
|
|
49
|
+
self.model, self.using_native_model = initialize_model(model)
|
|
50
|
+
self.evaluation_model = self.model.get_model_name()
|
|
51
|
+
self.include_reason = include_reason
|
|
52
|
+
self.async_mode = async_mode
|
|
53
|
+
self.strict_mode = strict_mode
|
|
54
|
+
self.verbose_mode = verbose_mode
|
|
55
|
+
|
|
56
|
+
def measure(
|
|
57
|
+
self,
|
|
58
|
+
test_case: ConversationalTestCase,
|
|
59
|
+
_show_indicator: bool = True,
|
|
60
|
+
_in_component: bool = False,
|
|
61
|
+
_log_metric_to_confident: bool = True,
|
|
62
|
+
):
|
|
63
|
+
check_conversational_test_case_params(
|
|
64
|
+
test_case, self._required_test_case_params, self
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
self.evaluation_cost = 0 if self.using_native_model else None
|
|
68
|
+
with metric_progress_indicator(
|
|
69
|
+
self, _show_indicator=_show_indicator, _in_component=_in_component
|
|
70
|
+
):
|
|
71
|
+
if self.async_mode:
|
|
72
|
+
loop = get_or_create_event_loop()
|
|
73
|
+
loop.run_until_complete(
|
|
74
|
+
self.a_measure(
|
|
75
|
+
test_case,
|
|
76
|
+
_show_indicator=False,
|
|
77
|
+
_in_component=_in_component,
|
|
78
|
+
_log_metric_to_confident=_log_metric_to_confident,
|
|
79
|
+
)
|
|
80
|
+
)
|
|
81
|
+
else:
|
|
82
|
+
unit_interactions = get_unit_interactions(test_case.turns)
|
|
83
|
+
user_input_and_tools = self._get_user_input_and_turns(
|
|
84
|
+
unit_interactions
|
|
85
|
+
)
|
|
86
|
+
tool_selection_scores = [
|
|
87
|
+
self._get_tool_selection_score(user_and_tools)
|
|
88
|
+
for user_and_tools in user_input_and_tools
|
|
89
|
+
]
|
|
90
|
+
argument_correctness_scores = [
|
|
91
|
+
self._get_argument_correctness_score(user_and_tools)
|
|
92
|
+
for user_and_tools in user_input_and_tools
|
|
93
|
+
if user_and_tools.tools_used
|
|
94
|
+
]
|
|
95
|
+
self.score = self._calculate_score(
|
|
96
|
+
tool_selection_scores, argument_correctness_scores
|
|
97
|
+
)
|
|
98
|
+
tool_selection_reason = (
|
|
99
|
+
self._generate_reason_for_tool_selection(
|
|
100
|
+
tool_selection_scores
|
|
101
|
+
)
|
|
102
|
+
)
|
|
103
|
+
argument_correctness_reason = (
|
|
104
|
+
self._generate_reason_for_argument_correctness(
|
|
105
|
+
argument_correctness_scores
|
|
106
|
+
)
|
|
107
|
+
)
|
|
108
|
+
self.reason = str(
|
|
109
|
+
"\n".join(
|
|
110
|
+
[tool_selection_reason, argument_correctness_reason]
|
|
111
|
+
)
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
self.verbose_logs = construct_verbose_logs(
|
|
115
|
+
self,
|
|
116
|
+
steps=[
|
|
117
|
+
f"Tool Selection Scores: {prettify_list(tool_selection_scores)} \n",
|
|
118
|
+
f"Argument Correctness Scores: {prettify_list(argument_correctness_scores)} \n",
|
|
119
|
+
f"Final Score: {self.score}",
|
|
120
|
+
f"Final Reason: {self.reason}",
|
|
121
|
+
],
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
if _log_metric_to_confident:
|
|
125
|
+
metric_data_manager.post_metric_if_enabled(
|
|
126
|
+
self, test_case=test_case
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
return self.score
|
|
130
|
+
|
|
131
|
+
async def a_measure(
|
|
132
|
+
self,
|
|
133
|
+
test_case: ConversationalTestCase,
|
|
134
|
+
_show_indicator: bool = True,
|
|
135
|
+
_in_component: bool = False,
|
|
136
|
+
_log_metric_to_confident: bool = True,
|
|
137
|
+
):
|
|
138
|
+
check_conversational_test_case_params(
|
|
139
|
+
test_case, self._required_test_case_params, self
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
self.evaluation_cost = 0 if self.using_native_model else None
|
|
143
|
+
with metric_progress_indicator(
|
|
144
|
+
self,
|
|
145
|
+
async_mode=True,
|
|
146
|
+
_show_indicator=_show_indicator,
|
|
147
|
+
_in_component=_in_component,
|
|
148
|
+
):
|
|
149
|
+
unit_interactions = get_unit_interactions(test_case.turns)
|
|
150
|
+
user_input_and_tools = self._get_user_input_and_turns(
|
|
151
|
+
unit_interactions
|
|
152
|
+
)
|
|
153
|
+
tool_selection_scores = await asyncio.gather(
|
|
154
|
+
*[
|
|
155
|
+
self._a_get_tool_selection_score(user_and_tools)
|
|
156
|
+
for user_and_tools in user_input_and_tools
|
|
157
|
+
]
|
|
158
|
+
)
|
|
159
|
+
argument_correctness_scores = await asyncio.gather(
|
|
160
|
+
*[
|
|
161
|
+
self._a_get_argument_correctness_score(user_and_tools)
|
|
162
|
+
for user_and_tools in user_input_and_tools
|
|
163
|
+
if user_and_tools.tools_used
|
|
164
|
+
]
|
|
165
|
+
)
|
|
166
|
+
self.score = self._calculate_score(
|
|
167
|
+
tool_selection_scores, argument_correctness_scores
|
|
168
|
+
)
|
|
169
|
+
tool_selection_reason = (
|
|
170
|
+
await self._a_generate_reason_for_tool_selection(
|
|
171
|
+
tool_selection_scores
|
|
172
|
+
)
|
|
173
|
+
)
|
|
174
|
+
argument_correctness_reason = (
|
|
175
|
+
await self._a_generate_reason_for_argument_correctness(
|
|
176
|
+
argument_correctness_scores
|
|
177
|
+
)
|
|
178
|
+
)
|
|
179
|
+
self.reason = str(
|
|
180
|
+
"\n".join([tool_selection_reason, argument_correctness_reason])
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
self.verbose_logs = construct_verbose_logs(
|
|
184
|
+
self,
|
|
185
|
+
steps=[
|
|
186
|
+
f"Tool Selection Scores: {prettify_list(tool_selection_scores)} \n",
|
|
187
|
+
f"Argument Correctness Scores: {prettify_list(argument_correctness_scores)} \n",
|
|
188
|
+
f"Final Score: {self.score}",
|
|
189
|
+
f"Final Reason: {self.reason}",
|
|
190
|
+
],
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
if _log_metric_to_confident:
|
|
194
|
+
metric_data_manager.post_metric_if_enabled(
|
|
195
|
+
self, test_case=test_case
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
return self.score
|
|
199
|
+
|
|
200
|
+
def _get_argument_correctness_score(
|
|
201
|
+
self, user_and_tools: UserInputAndTools
|
|
202
|
+
):
|
|
203
|
+
prompt = ToolUseTemplate.get_argument_correctness_score(
|
|
204
|
+
user_and_tools.user_messages,
|
|
205
|
+
user_and_tools.assistant_messages,
|
|
206
|
+
user_and_tools.tools_called,
|
|
207
|
+
user_and_tools.available_tools,
|
|
208
|
+
)
|
|
209
|
+
if self.using_native_model:
|
|
210
|
+
res, cost = self.model.generate(
|
|
211
|
+
prompt, schema=ArgumentCorrectnessScore
|
|
212
|
+
)
|
|
213
|
+
self.evaluation_cost += cost
|
|
214
|
+
return res
|
|
215
|
+
else:
|
|
216
|
+
try:
|
|
217
|
+
res: ArgumentCorrectnessScore = self.model.generate(
|
|
218
|
+
prompt, schema=ArgumentCorrectnessScore
|
|
219
|
+
)
|
|
220
|
+
return res
|
|
221
|
+
except TypeError:
|
|
222
|
+
res = self.model.generate(prompt)
|
|
223
|
+
data = trimAndLoadJson(res, self)
|
|
224
|
+
return ArgumentCorrectnessScore(**data)
|
|
225
|
+
|
|
226
|
+
async def _a_get_argument_correctness_score(
|
|
227
|
+
self,
|
|
228
|
+
user_and_tools: UserInputAndTools,
|
|
229
|
+
):
|
|
230
|
+
prompt = ToolUseTemplate.get_argument_correctness_score(
|
|
231
|
+
user_and_tools.user_messages,
|
|
232
|
+
user_and_tools.assistant_messages,
|
|
233
|
+
user_and_tools.tools_called,
|
|
234
|
+
user_and_tools.available_tools,
|
|
235
|
+
)
|
|
236
|
+
if self.using_native_model:
|
|
237
|
+
res, cost = await self.model.a_generate(
|
|
238
|
+
prompt, schema=ArgumentCorrectnessScore
|
|
239
|
+
)
|
|
240
|
+
self.evaluation_cost += cost
|
|
241
|
+
return res
|
|
242
|
+
else:
|
|
243
|
+
try:
|
|
244
|
+
res: ArgumentCorrectnessScore = await self.model.a_generate(
|
|
245
|
+
prompt, schema=ArgumentCorrectnessScore
|
|
246
|
+
)
|
|
247
|
+
return res
|
|
248
|
+
except TypeError:
|
|
249
|
+
res = await self.model.a_generate(prompt)
|
|
250
|
+
data = trimAndLoadJson(res, self)
|
|
251
|
+
return ArgumentCorrectnessScore(**data)
|
|
252
|
+
|
|
253
|
+
def _get_tool_selection_score(
|
|
254
|
+
self,
|
|
255
|
+
user_and_tools: UserInputAndTools,
|
|
256
|
+
):
|
|
257
|
+
prompt = ToolUseTemplate.get_tool_selection_score(
|
|
258
|
+
user_and_tools.user_messages,
|
|
259
|
+
user_and_tools.assistant_messages,
|
|
260
|
+
user_and_tools.tools_called,
|
|
261
|
+
user_and_tools.available_tools,
|
|
262
|
+
)
|
|
263
|
+
if self.using_native_model:
|
|
264
|
+
res, cost = self.model.generate(prompt, schema=ToolSelectionScore)
|
|
265
|
+
self.evaluation_cost += cost
|
|
266
|
+
return res
|
|
267
|
+
else:
|
|
268
|
+
try:
|
|
269
|
+
res: ToolSelectionScore = self.model.generate(
|
|
270
|
+
prompt, schema=ToolSelectionScore
|
|
271
|
+
)
|
|
272
|
+
return res
|
|
273
|
+
except TypeError:
|
|
274
|
+
res = self.model.generate(prompt)
|
|
275
|
+
data = trimAndLoadJson(res, self)
|
|
276
|
+
return ToolSelectionScore(**data)
|
|
277
|
+
|
|
278
|
+
async def _a_get_tool_selection_score(
|
|
279
|
+
self,
|
|
280
|
+
user_and_tools: UserInputAndTools,
|
|
281
|
+
):
|
|
282
|
+
prompt = ToolUseTemplate.get_tool_selection_score(
|
|
283
|
+
user_and_tools.user_messages,
|
|
284
|
+
user_and_tools.assistant_messages,
|
|
285
|
+
user_and_tools.tools_called,
|
|
286
|
+
user_and_tools.available_tools,
|
|
287
|
+
)
|
|
288
|
+
if self.using_native_model:
|
|
289
|
+
res, cost = await self.model.a_generate(
|
|
290
|
+
prompt, schema=ToolSelectionScore
|
|
291
|
+
)
|
|
292
|
+
self.evaluation_cost += cost
|
|
293
|
+
return res
|
|
294
|
+
else:
|
|
295
|
+
try:
|
|
296
|
+
res: ToolSelectionScore = await self.model.a_generate(
|
|
297
|
+
prompt, schema=ToolSelectionScore
|
|
298
|
+
)
|
|
299
|
+
return res
|
|
300
|
+
except TypeError:
|
|
301
|
+
res = await self.model.a_generate(prompt)
|
|
302
|
+
data = trimAndLoadJson(res, self)
|
|
303
|
+
return ToolSelectionScore(**data)
|
|
304
|
+
|
|
305
|
+
def _get_user_input_and_turns(
|
|
306
|
+
self,
|
|
307
|
+
unit_interactions: List[List[Turn]],
|
|
308
|
+
) -> List[UserInputAndTools]:
|
|
309
|
+
user_inputs_and_tools = []
|
|
310
|
+
available_tools = ",".join(
|
|
311
|
+
[repr(tool) for tool in self.available_tools]
|
|
312
|
+
)
|
|
313
|
+
for unit_interaction in unit_interactions:
|
|
314
|
+
if len(unit_interaction) < 2:
|
|
315
|
+
continue
|
|
316
|
+
user_messages = ""
|
|
317
|
+
assistant_messages = ""
|
|
318
|
+
tools_called = []
|
|
319
|
+
tools_used = False
|
|
320
|
+
for turn in unit_interaction:
|
|
321
|
+
if turn.role == "user":
|
|
322
|
+
user_messages += f"{turn.content} \n"
|
|
323
|
+
else:
|
|
324
|
+
break
|
|
325
|
+
for turn in unit_interaction[1:]:
|
|
326
|
+
if turn.role == "assistant":
|
|
327
|
+
assistant_messages += f"{turn.content} \n"
|
|
328
|
+
if turn.tools_called:
|
|
329
|
+
tools_called.extend(turn.tools_called)
|
|
330
|
+
tools_used = True
|
|
331
|
+
tools_called = ",".join([repr(tool) for tool in tools_called])
|
|
332
|
+
new_user_input_tools = UserInputAndTools(
|
|
333
|
+
user_messages=user_messages,
|
|
334
|
+
assistant_messages=assistant_messages,
|
|
335
|
+
tools_called=tools_called,
|
|
336
|
+
available_tools=available_tools,
|
|
337
|
+
tools_used=tools_used,
|
|
338
|
+
)
|
|
339
|
+
user_inputs_and_tools.append(new_user_input_tools)
|
|
340
|
+
return user_inputs_and_tools
|
|
341
|
+
|
|
342
|
+
def _calculate_score(
|
|
343
|
+
self,
|
|
344
|
+
tool_use_scores: List[ToolSelectionScore],
|
|
345
|
+
argument_correctness_scores: List[ArgumentCorrectnessScore],
|
|
346
|
+
):
|
|
347
|
+
tools_scores_sum = sum(
|
|
348
|
+
[tool_use_score.score for tool_use_score in tool_use_scores]
|
|
349
|
+
)
|
|
350
|
+
arguments_scores_sum = sum(
|
|
351
|
+
[
|
|
352
|
+
argument_correctness_score.score
|
|
353
|
+
for argument_correctness_score in argument_correctness_scores
|
|
354
|
+
]
|
|
355
|
+
)
|
|
356
|
+
tool_selections_scores_divisor = (
|
|
357
|
+
len(tool_use_scores) if len(tool_use_scores) > 0 else 1
|
|
358
|
+
)
|
|
359
|
+
argument_correctness_score_divisor = (
|
|
360
|
+
len(argument_correctness_scores)
|
|
361
|
+
if len(argument_correctness_scores) > 0
|
|
362
|
+
else 1
|
|
363
|
+
)
|
|
364
|
+
tools_selction_score = tools_scores_sum / tool_selections_scores_divisor
|
|
365
|
+
argument_correctness_score = (
|
|
366
|
+
arguments_scores_sum / argument_correctness_score_divisor
|
|
367
|
+
)
|
|
368
|
+
score = min(tools_selction_score, argument_correctness_score)
|
|
369
|
+
return 0 if self.strict_mode and score < self.threshold else score
|
|
370
|
+
|
|
371
|
+
def _generate_reason_for_tool_selection(
|
|
372
|
+
self,
|
|
373
|
+
tool_use_scores: List[ToolSelectionScore],
|
|
374
|
+
):
|
|
375
|
+
scores_and_reasons = ""
|
|
376
|
+
for tool_use in tool_use_scores:
|
|
377
|
+
scores_and_reasons += (
|
|
378
|
+
f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
|
|
379
|
+
)
|
|
380
|
+
prompt = ToolUseTemplate.get_tool_selection_final_reason(
|
|
381
|
+
scores_and_reasons, self.score, self.threshold
|
|
382
|
+
)
|
|
383
|
+
if self.using_native_model:
|
|
384
|
+
res, cost = self.model.generate(prompt)
|
|
385
|
+
self.evaluation_cost += cost
|
|
386
|
+
return res
|
|
387
|
+
else:
|
|
388
|
+
res = self.model.generate(prompt)
|
|
389
|
+
return res
|
|
390
|
+
|
|
391
|
+
def _generate_reason_for_argument_correctness(
|
|
392
|
+
self,
|
|
393
|
+
argument_correctness_scores: List[ArgumentCorrectnessScore],
|
|
394
|
+
):
|
|
395
|
+
scores_and_reasons = ""
|
|
396
|
+
for tool_use in argument_correctness_scores:
|
|
397
|
+
scores_and_reasons += (
|
|
398
|
+
f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
|
|
399
|
+
)
|
|
400
|
+
prompt = ToolUseTemplate.get_tool_selection_final_reason(
|
|
401
|
+
scores_and_reasons, self.score, self.threshold
|
|
402
|
+
)
|
|
403
|
+
if self.using_native_model:
|
|
404
|
+
res, cost = self.model.generate(prompt)
|
|
405
|
+
self.evaluation_cost += cost
|
|
406
|
+
return res
|
|
407
|
+
else:
|
|
408
|
+
res = self.model.generate(prompt)
|
|
409
|
+
return res
|
|
410
|
+
|
|
411
|
+
async def _a_generate_reason_for_tool_selection(
|
|
412
|
+
self, tool_use_scores: List[ToolSelectionScore]
|
|
413
|
+
):
|
|
414
|
+
scores_and_reasons = ""
|
|
415
|
+
for tool_use in tool_use_scores:
|
|
416
|
+
scores_and_reasons += (
|
|
417
|
+
f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
|
|
418
|
+
)
|
|
419
|
+
prompt = ToolUseTemplate.get_tool_selection_final_reason(
|
|
420
|
+
scores_and_reasons, self.score, self.threshold
|
|
421
|
+
)
|
|
422
|
+
if self.using_native_model:
|
|
423
|
+
res, cost = await self.model.a_generate(prompt)
|
|
424
|
+
self.evaluation_cost += cost
|
|
425
|
+
return res
|
|
426
|
+
else:
|
|
427
|
+
res = await self.model.a_generate(prompt)
|
|
428
|
+
return res
|
|
429
|
+
|
|
430
|
+
async def _a_generate_reason_for_argument_correctness(
|
|
431
|
+
self, argument_correctness_scores: List[ArgumentCorrectnessScore]
|
|
432
|
+
):
|
|
433
|
+
scores_and_reasons = ""
|
|
434
|
+
for tool_use in argument_correctness_scores:
|
|
435
|
+
scores_and_reasons += (
|
|
436
|
+
f"\nScore: {tool_use.score} \nReason: {tool_use.reason} \n"
|
|
437
|
+
)
|
|
438
|
+
prompt = ToolUseTemplate.get_tool_selection_final_reason(
|
|
439
|
+
scores_and_reasons, self.score, self.threshold
|
|
440
|
+
)
|
|
441
|
+
if self.using_native_model:
|
|
442
|
+
res, cost = await self.model.a_generate(prompt)
|
|
443
|
+
self.evaluation_cost += cost
|
|
444
|
+
return res
|
|
445
|
+
else:
|
|
446
|
+
res = await self.model.a_generate(prompt)
|
|
447
|
+
return res
|
|
448
|
+
|
|
449
|
+
def is_successful(self) -> bool:
|
|
450
|
+
try:
|
|
451
|
+
self.success = self.score >= self.threshold
|
|
452
|
+
except (AttributeError, TypeError):
|
|
453
|
+
self.success = False
|
|
454
|
+
return self.success
|
|
455
|
+
|
|
456
|
+
@property
|
|
457
|
+
def __name__(self):
|
|
458
|
+
return "Tool Use"
|
|
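The tool_use/tool_use.py hunk above implements the metric itself: it segments the conversation into unit interactions, scores tool selection and argument correctness separately, and takes the minimum of the two averages as the final score. A hedged end-to-end sketch follows; ToolUseMetric, measure(), and the available_tools/threshold parameters are confirmed by the hunk, while the exact Turn and ToolCall constructor fields beyond role, content, tools_called, and name are assumptions about deepeval's test-case API.

from deepeval.test_case import ConversationalTestCase, Turn, ToolCall
from deepeval.metrics.tool_use.tool_use import ToolUseMetric

# A two-turn conversation where the assistant calls one of the declared tools.
test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="What's the weather in Paris tomorrow?"),
        Turn(
            role="assistant",
            content="Tomorrow in Paris: 18°C and partly cloudy.",
            tools_called=[ToolCall(name="get_weather")],
        ),
    ]
)

# Requires a judge model to be configured (e.g., an OpenAI API key), since
# passing model=None falls back to deepeval's default evaluation model.
metric = ToolUseMetric(
    available_tools=[ToolCall(name="get_weather"), ToolCall(name="web_search")],
    threshold=0.5,
)
metric.measure(test_case)
print(metric.score, metric.reason)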
deepeval/metrics/topic_adherence/__init__.py (new file)
@@ -0,0 +1 @@
+from .topic_adherence import TopicAdherenceMetric