ibm-watsonx-orchestrate-evaluation-framework 1.0.8__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ibm-watsonx-orchestrate-evaluation-framework might be problematic.
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/METADATA +103 -109
- ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info/RECORD +97 -0
- wxo_agentic_evaluation/analytics/tools/main.py +1 -18
- wxo_agentic_evaluation/analyze_run.py +358 -97
- wxo_agentic_evaluation/arg_configs.py +28 -1
- wxo_agentic_evaluation/description_quality_checker.py +149 -0
- wxo_agentic_evaluation/evaluation_package.py +58 -17
- wxo_agentic_evaluation/inference_backend.py +32 -17
- wxo_agentic_evaluation/llm_user.py +2 -1
- wxo_agentic_evaluation/metrics/metrics.py +22 -1
- wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2 +178 -0
- wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +9 -1
- wxo_agentic_evaluation/prompt/off_policy_attack_generation_prompt.jinja2 +34 -0
- wxo_agentic_evaluation/prompt/on_policy_attack_generation_prompt.jinja2 +46 -0
- wxo_agentic_evaluation/prompt/template_render.py +34 -3
- wxo_agentic_evaluation/quick_eval.py +342 -0
- wxo_agentic_evaluation/red_teaming/attack_evaluator.py +113 -0
- wxo_agentic_evaluation/red_teaming/attack_generator.py +286 -0
- wxo_agentic_evaluation/red_teaming/attack_list.py +96 -0
- wxo_agentic_evaluation/red_teaming/attack_runner.py +128 -0
- wxo_agentic_evaluation/referenceless_eval/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/consts.py +28 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/base.py +27 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general.py +49 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_call/general_metrics_runtime.json +580 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection.py +31 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/function_selection/function_selection_metrics_runtime.json +477 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/metrics/loader.py +237 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/adapters.py +101 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/pipeline.py +263 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/semantic_checker.py +455 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/static_checker.py +156 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/transformation_prompts.py +509 -0
- wxo_agentic_evaluation/referenceless_eval/function_calling/pipeline/types.py +547 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/__init__.py +3 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/field.py +258 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metric.py +333 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/metrics_runner.py +188 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/prompt.py +409 -0
- wxo_agentic_evaluation/referenceless_eval/metrics/utils.py +42 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/__init__.py +0 -0
- wxo_agentic_evaluation/referenceless_eval/prompt/runner.py +145 -0
- wxo_agentic_evaluation/referenceless_eval/referenceless_eval.py +114 -0
- wxo_agentic_evaluation/service_instance.py +2 -2
- wxo_agentic_evaluation/service_provider/__init__.py +15 -6
- wxo_agentic_evaluation/service_provider/model_proxy_provider.py +4 -3
- wxo_agentic_evaluation/service_provider/referenceless_provider_wrapper.py +138 -0
- wxo_agentic_evaluation/service_provider/watsonx_provider.py +11 -4
- wxo_agentic_evaluation/tool_planner.py +3 -1
- wxo_agentic_evaluation/type.py +33 -2
- wxo_agentic_evaluation/utils/__init__.py +0 -1
- wxo_agentic_evaluation/utils/open_ai_tool_extractor.py +157 -0
- wxo_agentic_evaluation/utils/rich_utils.py +174 -0
- wxo_agentic_evaluation/utils/rouge_score.py +23 -0
- wxo_agentic_evaluation/utils/utils.py +167 -5
- ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info/RECORD +0 -56
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/WHEEL +0 -0
- {ibm_watsonx_orchestrate_evaluation_framework-1.0.8.dist-info → ibm_watsonx_orchestrate_evaluation_framework-1.1.0.dist-info}/top_level.txt +0 -0
wxo_agentic_evaluation/analyze_run.py
@@ -1,123 +1,384 @@
 import json
 import os
 import csv
-import
+from jsonargparse import CLI
+from pathlib import Path
+from typing import List, Dict, Set, Optional
+
 from rich.text import Text
-from rich.panel import Panel
-from rich.layout import Layout
 from rich.table import Table
-from
-from
-
-
-
+from rich.panel import Panel
+from rich.console import Group
+from rich.style import Style
+
+from wxo_agentic_evaluation.type import ExtendedMessage, ContentType, ToolDefinition
 from wxo_agentic_evaluation.metrics.metrics import ToolCallAndRoutingMetrics
 from wxo_agentic_evaluation.arg_configs import AnalyzeConfig
-from
+from wxo_agentic_evaluation.description_quality_checker import (
+    DescriptionQualityInspector,
+)
+from wxo_agentic_evaluation.utils.rich_utils import (
+    pretty_print,
+    warn,
+    is_ok,
+    print_done,
+    IncorrectParameterUtils,
+)
+from wxo_agentic_evaluation.utils.utils import (
+    add_line_seperator,
+)


-
-conversation_lines = []
-reason_lines = []
+class Analyzer:

-
-
-
-
-
-
-
-
-
-
-
-
+    def __init__(self):
+        self.analysis_cache: Dict[str, List[Text]] = (
+            {}
+        )  # the failing tools cached here won't be re-analyzed.
+        # tool_name -> description analysis
+
+    @staticmethod
+    def _generate_style_config():
+        return Style(
+            color="magenta",
+            blink=True,
+            bold=True,
+        )
+
+    def _split_cache(self, failing_tools: Set[str]) -> tuple[List[str], List[Text]]:
+
+        tools_to_analyze: List[str] = []
+        cached_lines: List[Text] = []
+        tools_analyzed: List[str] = []
+
+        for tool_name in sorted(failing_tools):
+            cached_analysis = self.analysis_cache.get(tool_name)
+            if cached_analysis:
+                cached_lines.extend(cached_analysis)
+                tools_analyzed.append(tool_name)
             else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    summary = []
-    with open(os.path.join(config.data_path, "summary_metrics.csv"), "r") as f:
-        reader = csv.reader(f)
-        header = next(reader)
-        for row in reader:
-            summary.append(dict(zip(header, row)))
-
-    test_case_with_failed_tools = []
-    for entry in summary:
-        test_case_name = entry["dataset_name"]
-        if test_case_name.lower().strip() == "summary (average)":
-            continue
-        if not entry["is_success"] or float(entry["tool_calls_with_incorrect_parameter"]) > 0 or float(entry["tool_call_precision"]) < 1.0\
-            or float(entry["tool_call_recall"]) < 1.0:
-            test_case_with_failed_tools.append(entry)
-    if len(test_case_with_failed_tools) == 0:
-        header_table = Table(show_header=False, box=None)
-        header_table.add_row(f"No Tool Call Error found!")
-        header_panel = Panel(
-            header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+                tools_to_analyze.append(tool_name)
+
+        if tools_analyzed:
+            pretty_print(
+                content=f"ℹ️ Loading cached analysis since these failing tools: {tools_analyzed} have been analyzed previously.",
+                style="bold cyan",
+            )
+
+        return (
+            tools_to_analyze,
+            cached_lines
+        )
+
+
+    def analyze_failing_tool_description_quality(
+        self,
+        inspector: DescriptionQualityInspector,
+        tool_definition_path: str,
+        failing_tools: Set[str],
+    ) -> List[Text]:
+        """
+        :param tool_definition_path: Path to the tool definition file.
+        :param failing_tools: Set of tool names that failed.
+        :return: List of rich `Text` objects containing feedback for the customer.
+        """
+
+        pretty_print(
+            content=f"⚙️ Checking tool description quality for failing tools: {sorted(failing_tools)}",
+            style="bold cyan",
         )
-        rich.print(header_panel)

-
-
+        analysis_for_display: List[Text] = []
+
+        # Step 1: get tools not yet analyzed and cached analysis for tools analyzed previously
+        tools_to_analyze, cached_analysis = self._split_cache(failing_tools)
+        if cached_analysis:
+            analysis_for_display.extend(cached_analysis)
+
+        # Step 2: analyze cache misses
+        if tools_to_analyze:
+
+            failing_tool_definitions: List[ToolDefinition] = inspector.extract_tool_desc_from_tool_source(
+                Path(tool_definition_path),
+                tools_to_analyze,
+            )
+
+            if not failing_tool_definitions:
+                analysis_for_display.append(
+                    warn(
+                        message=f"No tool definitions(with '@tool' decorators) for failed tools: '{tools_to_analyze}' found in the file: '{tool_definition_path}'"
+                    )
+                )
+                return analysis_for_display
+
+            missing_tools = self._get_tools_not_found_in_source(
+                tools_to_analyze, failing_tool_definitions
+            )
+            if missing_tools:
+                analysis_for_display.append(
+                    warn(
+                        message=f"Missing tool definitions for failed tools: '{missing_tools}' in the file: '{tool_definition_path}'"
+                    )
+                )
+
+            for tool_definition in failing_tool_definitions:
+
+                tool_analysis = self._analyze_tool_definition(
+                    inspector=inspector,
+                    tool_definition=tool_definition,
+                    tool_definition_path=tool_definition_path,
+                )
+
+                self.analysis_cache[tool_definition.tool_name] = tool_analysis
+                analysis_for_display.extend(tool_analysis)
+
+        return analysis_for_display
+
+    def render(self, data: List[ExtendedMessage], tool_definition_path: Optional[str]) -> Group:
+        """
+        Render the conversation history and analysis results.
+        :param data: List of ExtendedMessage objects containing the conversation history.
+        :param tool_definition_path: Path to the tool definition file.
+        :return: A rich Group object containing the conversation and analysis results.
+        """
+        conversation_lines = []
+        reason_lines = []
+        failing_tools = []
+
+        for entry in data:
+            msg = entry.message
+            role = msg.role
+            content = msg.content
+            reason = entry.reason
+            tool_name = None
+            if msg.type == ContentType.tool_call or msg.type == ContentType.tool_response:
+                tool_name = json.loads(msg.content)["name"]

-
-
+            if role == "user":
+                label = "👤 User"
+            elif role == "assistant" and msg.type == ContentType.tool_call:
+                if reason:
+                    label = "❌ Tool Call"
+
+                    if reason.get("reason") == "incorrect parameter":
+                        failing_tools.append(
+                            tool_name
+                        )  # create a list of failing tools for description quality analysis.
+                else:
+                    label = "✅ Tool Call"
+            elif role == "assistant":
+                label = "🤖 Assistant"
+            else:
+                label = "📦 Unknown"
+
+            text_line = Text(f"{label}: {content}\n")
+            if reason:
+                text_line.stylize("bold red")
+                reason_text = f"❌ {tool_name}: {json.dumps(reason)}\n\n"
+                reason_lines.append(Text(reason_text, style="red"))
+            conversation_lines.append(text_line)
+
+        if failing_tools and tool_definition_path:
+
+            inspector = DescriptionQualityInspector()
+
+            description_quality_inspection_lines = (
+                self.analyze_failing_tool_description_quality(
+                    inspector, tool_definition_path, set(failing_tools)
+                )
+            )
+
+            print_done()
+
+            if description_quality_inspection_lines:
+                reason_lines.extend(description_quality_inspection_lines)
+
+        conversation_panel = Panel(
+            Text().join(conversation_lines),
+            title="Conversation History",
+            border_style="blue",
         )
-
-
-
-
-
-
-
-
-
+        reason_panel = Panel(
+            Text().join(reason_lines),
+            title="Analysis Results",
+            border_style="red",
+        )
+
+        return Group(
+            conversation_panel,
+            reason_panel,
         )
-
-
+
+    def analyze(self, config: AnalyzeConfig):
+        """
+        Analyze the results of the tool calls and routing metrics.
+        :param config: AnalyzeConfig object containing user provided paths for analysis.
+        """
+
+        def get_summary(summary_file_name: str = "summary_metrics.csv"):
+            summary = []
+
+            path_to_summary_file = os.path.join(config.data_path, summary_file_name)
+
+            with open(path_to_summary_file, "r") as f:
+                reader = csv.reader(f)
+                header = next(reader)
+                for row in reader:
+                    summary.append(dict(zip(header, row)))
+
+            return summary
+
+        def get_test_messages(test_case_name):
+            test_messages = []
+
+            test_case_path = os.path.join(
+                config.data_path, "messages", f"{test_case_name}.messages.analyze.json"
+            )
+
+            with open(test_case_path, "r", encoding="utf-8") as f:
+                temp = json.load(f)
+                for entry in temp:
+                    msg = ExtendedMessage(**entry)
+                    test_messages.append(msg)
+
+            return test_messages
+
+        def get_metrics(test_case_name):
+            test_metrics_path = os.path.join(
+                config.data_path, "messages", f"{test_case_name}.metrics.json"
+            )
+
+            with open(test_metrics_path, "r", encoding="utf-8") as f:
+                metrics = ToolCallAndRoutingMetrics(**json.load(f))
+
+            return metrics
+
+        summary = get_summary()
+
+        test_case_with_failed_tools = self._get_test_case_with_failed_tools(
+            summary=summary
+        )
+
+        if len(test_case_with_failed_tools) == 0:
+            header_table = Table(show_header=False, box=None)
+
+            header_table.add_row("No Tool Call Error found!")
+
+            panel = Panel(
+                header_table, title="[bold green]📋 Analysis Summary[/bold green]"
+            )
+
+            pretty_print(panel)
+
+        for test_case_entry in test_case_with_failed_tools:
+            test_case_name = test_case_entry["dataset_name"]
+
+            test_messages = get_test_messages(test_case_name=test_case_name)
+
+            metrics: ToolCallAndRoutingMetrics = get_metrics(
+                test_case_name=test_case_name
+            )
+
+            header_panel = self._create_header_analysis_panel(test_case_name, metrics)
+            pretty_print(header_panel)
+
+            tool_definition_path = config.tool_definition_path \
+                if config.tool_definition_path else None
+
+            rendered_content = self.render(
+                data=test_messages,
+                tool_definition_path=tool_definition_path
+            )
+            pretty_print(rendered_content)
+
+            add_line_seperator(
+                self._generate_style_config()
+            )
+
+    def _create_header_analysis_panel(
+        self, test_case_name: str, metrics: ToolCallAndRoutingMetrics
+    ) -> Panel:
         header_table = Table(show_header=False, box=None)
+
         header_table.add_row(f"Test Case Name: {test_case_name}")
-        header_table.add_row(
+        header_table.add_row(f"Expected Tool Calls: {metrics.expected_tool_calls}")
         header_table.add_row(f"Correct Tool Calls: {metrics.correct_tool_calls}")
         header_table.add_row(f"Text Match: {metrics.text_match.value}")
-        header_table.add_row(
-
-        )
+        header_table.add_row(f"Journey Success: {metrics.is_success}")
+
         header_panel = Panel(
             header_table, title="[bold green]📋 Analysis Summary[/bold green]"
         )
-        rich.print(header_panel)
-        layout = render(test_messages)
-        rich.print(layout)

+        return header_panel
+
+    def _get_test_case_with_failed_tools(self, summary) -> List:
+
+        test_case_with_failed_tools = []
+
+        for entry in summary:
+            test_case_name = entry["dataset_name"]
+
+            if test_case_name.lower().strip() == "summary (average)":
+                continue
+
+            if (
+                not entry["is_success"]
+                or float(entry["tool_calls_with_incorrect_parameter"]) > 0
+                or float(entry["tool_call_precision"]) < 1.0
+                or float(entry["tool_call_recall"]) < 1.0
+            ):
+
+                test_case_with_failed_tools.append(entry)
+
+        return test_case_with_failed_tools
+
+    def _get_tools_not_found_in_source(
+        self,
+        tools_to_analyze: List[str],
+        failing_tool_definitions: List[ToolDefinition],
+    ) -> Set[str]:
+
+        return set(tools_to_analyze) - {
+            tool_def.tool_name for tool_def in failing_tool_definitions
+        }
+
+    def _analyze_tool_definition(
+        self,
+        inspector: DescriptionQualityInspector,
+        tool_definition: ToolDefinition,
+        tool_definition_path: str,
+    ) -> List[Text]:
+
+        tool_name = tool_definition.tool_name
+        tool_desc = tool_definition.tool_description
+
+        tool_analysis = []
+
+        # missing description
+        if tool_desc is None:
+            tool_analysis.extend(
+                IncorrectParameterUtils.format_missing_description_message(
+                    tool_name=tool_name, tool_definition_path=tool_definition_path
+                )
+            )
+            return tool_analysis
+
+        # bad description
+        if inspector.detect_bad_description(tool_definition):
+            tool_analysis.extend(
+                IncorrectParameterUtils.format_bad_description_message(
+                    tool_name=tool_name, tool_desc=tool_desc
+                )
+            )
+            return tool_analysis
+
+        # good description
+        tool_analysis.append(
+            is_ok(message=f"The description for the `{tool_name}` looks sufficient.")
+        )
+        return tool_analysis

 if __name__ == "__main__":
-
+    dummy_analyzer = Analyzer()
+    dummy_analyzer.analyze(CLI(AnalyzeConfig, as_positional=False))
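The module-level script above has been refactored into an Analyzer class whose analyze entry point is driven by jsonargparse's CLI over AnalyzeConfig. A minimal sketch of driving it programmatically, assuming the package is installed; the paths are placeholders, not values from the package:

# Sketch only: placeholder paths, same call shape as the __main__ block in the diff.
from wxo_agentic_evaluation.analyze_run import Analyzer
from wxo_agentic_evaluation.arg_configs import AnalyzeConfig

config = AnalyzeConfig(
    data_path="results/run_01",                # folder holding summary_metrics.csv and messages/
    tool_definition_path="tools/hr_tools.py",  # optional; enables the description-quality checks
)
Analyzer().analyze(config)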
wxo_agentic_evaluation/arg_configs.py
@@ -1,6 +1,6 @@
 import os
 from dataclasses import dataclass, field
-from typing import List
+from typing import List, Optional, Union
 from wxo_agentic_evaluation import __file__

 root_dir = os.path.dirname(__file__)
@@ -43,9 +43,32 @@ class TestConfig:
     num_workers: int = 2


+@dataclass
+class AttackConfig:
+    attack_paths: List[str]
+    output_dir: str
+    auth_config: AuthConfig
+    provider_config: ProviderConfig = field(default_factory=ProviderConfig)
+    llm_user_config: LLMUserConfig = field(default_factory=LLMUserConfig)
+    enable_verbose_logging: bool = True
+    enable_manual_user_input: bool = False
+    num_workers: int = 2
+
+
+@dataclass
+class AttackGeneratorConfig:
+    attacks_list: Union[List[str], str]
+    datasets_path: Union[List[str], str]
+    agents_path: str
+    target_agent_name: str
+    output_dir: str = None
+    max_variants: int = None
+
+
 @dataclass
 class AnalyzeConfig:
     data_path: str
+    tool_definition_path: Optional[str] = None


 @dataclass
@@ -77,6 +100,10 @@ class ChatRecordingConfig:
     max_retries: int = 5


+@dataclass
+class QuickEvalConfig(TestConfig):
+    tools_path: str = None
+
 @dataclass
 class BatchAnnotateConfig:
     allowed_tools: List[str]
wxo_agentic_evaluation/description_quality_checker.py (new file)
@@ -0,0 +1,149 @@
+import os
+from pathlib import Path
+from typing import List
+import rich
+from enum import Enum
+
+from wxo_agentic_evaluation.service_provider import get_provider
+from wxo_agentic_evaluation.prompt.template_render import BadToolDescriptionRenderer
+from wxo_agentic_evaluation.tool_planner import (
+    parse_json_string,
+    extract_tool_signatures,
+    MISSING_DOCSTRING_PROMPT,
+)
+from wxo_agentic_evaluation.type import ToolDefinition
+from wxo_agentic_evaluation.utils.utils import safe_divide
+
+
+class ToolDescriptionIssue(Enum):
+    """
+    Represents the binary outcomes the LLM judge will classify in its assessment \
+    of the tool's description.
+    The presence of these issues in the tool's description indicates poor quality.
+    For more detail on what each issue indicates, please take a look at the template here: `wxo_agentic_evaluation/prompt/bad_tool_descriptions_prompt.jinja2`.
+    """
+
+    # TODO: Priority-based weighting of issues.
+    CONTAINS_REDUNDANT_INFORMATION = "contains_redundant_information"
+    USES_VAGUE_LANGUAGE = "uses_vague_language"
+    DOES_NOT_HELP_IN_IDENTIFYING_TOOL_UNIQUELY = (
+        "does_not_help_in_identifying_tool_uniquely"
+    )
+    PROVIDES_NO_NEW_INFORMATION = "provides_no_new_information"
+    DOES_NOT_CONVEY_TOOL_PURPOSE = "does_not_convey_tool_purpose"
+
+
+class DescriptionQualityInspector:
+    DEFAULT_CLASSIFICATION_THRESHOLD = (
+        40.0  # 2/5 issues detected. A higher score indicates a worse description.
+    )
+    CLASSIFICATION_SCORE_THRESHOLD = float(
+        os.getenv("CLASSIFICATION_SCORE_THRESHOLD", DEFAULT_CLASSIFICATION_THRESHOLD)
+    )
+
+    LLM_MODEL_ID = "meta-llama/llama-3-2-90b-vision-instruct"
+    LLM_PARAMS = {
+        "min_new_tokens": 128,
+        "decoding_method": "greedy",
+        "max_new_tokens": 512,
+    }
+
+    WORST_POSSIBLE_EVAL_OUTCOME = len(
+        ToolDescriptionIssue
+    )  # the final score used for classification is normalized against this value.
+
+    root_dir = os.path.dirname(__file__)
+    BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH = os.path.join(
+        root_dir, "prompt", "bad_tool_descriptions_prompt.jinja2"
+    )
+
+    def __init__(self, llm_client=None):
+        if llm_client is None:
+            llm_client = get_provider(
+                model_id=self.LLM_MODEL_ID,
+                params=self.LLM_PARAMS,
+            )
+        self.llm_client = llm_client
+        self.template = BadToolDescriptionRenderer(
+            self.BAD_TOOL_DESCRIPTIONS_DETECTOR_PATH
+        )
+        self.cached_response = (
+            None  # this is used in the unit-tests for nuanced analysis of the response.
+        )
+
+    @staticmethod
+    def extract_tool_desc_from_tool_source(
+        tool_source: Path, failing_tools: List[str]
+    ) -> List[ToolDefinition]:
+        """
+        Parses the tool source file to extract the tool description.
+        Wraps the description along with the tool name, and args into a `ToolDefinition` for all `failing_tools`.
+        This `ToolDefinition` is later rendered into the judge's prompt template for evaluation.
+        Args:
+            tool_source (Path): The path to the tool source file/dir containing `.py` tools.
+            failing_tools (List[str]): List of tool names that failed during inference.
+        Returns:
+            List[ToolDefinition]: The extracted tool definition(s) or [] if the file contains no @tool decorators.
+        """
+        all_tool_data = extract_tool_signatures(tool_source)
+
+        tool_definitions = []
+        for tool_data in all_tool_data:
+            tool_name = tool_data["Function Name"]
+            if tool_name in failing_tools:
+                tool_definitions.append(
+                    ToolDefinition(
+                        tool_name=tool_name,
+                        tool_description=(
+                            tool_data["Docstring"]
+                            if tool_data["Docstring"] != MISSING_DOCSTRING_PROMPT
+                            else None
+                        ),
+                        tool_params=tool_data["Arguments"],
+                    )
+                )
+        return tool_definitions
+
+    def detect_bad_description(self, tool_definition: ToolDefinition) -> bool:
+        """
+        Detects if a tool description is 'bad' using an LLM judge.
+        A 'bad' description is one that:
+        - does not describe the tool's functionality/use-case clearly
+        - does not provide sufficient detail for an agent to understand how to use the tool
+        - does not distinguish the tool from other tools
+        For the exact definition of a 'bad' description, refer to `ToolDescriptionIssue` Enum.
+        Args:
+            tool_definition (ToolDefinition): The definition of the tool to evaluate.
+        Returns:
+            bool: True if the description is 'bad', False otherwise.
+        """
+        prompt = self.template.render(tool_definition=tool_definition)
+        response = self.llm_client.query(prompt)
+
+        # parse JSON objects from cleaned text
+        json_objects = parse_json_string(response)
+
+        # pick the first JSON object
+        if json_objects:
+            response_data = json_objects[0]
+            self.cached_response = response_data
+        else:
+            return False  # likely some unexpected parsing issue, in this case - flags description as good.
+
+        # calculate weighted score
+        final_description_score = self._calculate_score(response_data=response_data)
+
+        return final_description_score >= self.CLASSIFICATION_SCORE_THRESHOLD
+
+    def _calculate_score(self, response_data: dict) -> float:
+        """
+        Calculates a final score for the tool description.
+        This score is used to finally classify a 'good' or 'bad' description.
+        :param response_data: Parsed JSON response returned by the LLM judge.
+        """
+        detected_issues = sum(
+            1
+            for issue in ToolDescriptionIssue
+            if response_data.get(issue.value, "FALSE").upper() == "TRUE"
+        )
+        return safe_divide(detected_issues, self.WORST_POSSIBLE_EVAL_OUTCOME) * 100
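Each of the five ToolDescriptionIssue flags the judge marks TRUE contributes 20 points to the score, so the default threshold of 40.0 flags a description as bad once two or more issues are detected. A small worked sketch of that arithmetic, using a made-up judge response:

# Sketch only: the response dict is an invented example of the judge's parsed JSON output.
judge_response = {
    "contains_redundant_information": "TRUE",
    "uses_vague_language": "TRUE",
    "does_not_help_in_identifying_tool_uniquely": "FALSE",
    "provides_no_new_information": "FALSE",
    "does_not_convey_tool_purpose": "FALSE",
}
detected_issues = sum(1 for v in judge_response.values() if v.upper() == "TRUE")  # 2
score = detected_issues / 5 * 100   # 40.0 (safe_divide(2, 5) * 100 in the module)
is_bad = score >= 40.0              # True: two detected issues meet the default threshold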