onetool_mcp-1.0.0b1-py3-none-any.whl
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
bench/py.typed
ADDED
File without changes
bench/reporter.py
ADDED
@@ -0,0 +1,629 @@
"""Console reporter for progress events during benchmark runs."""

from __future__ import annotations

import json
import re
from typing import TYPE_CHECKING, Any

from rich import box
from rich.rule import Rule
from rich.table import Table

from ot.config import get_config
from ot.logging import LogSpan

# Patterns to detect in MCP tool responses that indicate LLM retry behavior
TOOL_RESPONSE_ERROR_PATTERNS = [
    r"Code validation failed",
]

if TYPE_CHECKING:
    from rich.console import Console

    from bench.harness.config import HarnessConfig
    from bench.harness.metrics import TaskResult


def _extract_result(response: str) -> str:
    """Extract result from JSON response if present."""
    try:
        parsed = json.loads(response)
        if isinstance(parsed, dict) and "result" in parsed:
            return str(parsed["result"])
    except json.JSONDecodeError:
        pass
    return response


class SpanPrinter:
    """Formats and prints spans to console as name=value pairs.

    Verbose mode:
    - Non-verbose: newlines replaced with \\n, values truncated at compact_max_length
    - Verbose: full content with actual newlines
    """

    def __init__(self, console: Console, verbose: bool = False) -> None:
        self.console = console
        self.verbose = verbose
        self.compact_max_length = get_config().compact_max_length

    def _format_value(self, value: Any) -> str:
        """Format a value for console output based on verbose mode."""
        # Format lists as comma-separated
        if isinstance(value, list):
            result = ", ".join(str(v) for v in value) if value else ""
        else:
            result = str(value)

        # Non-verbose: truncate and replace newlines
        if not self.verbose:
            result = result.replace("\n", "\\n")
            if len(result) > self.compact_max_length:
                result = result[: self.compact_max_length] + "..."

        return result

    def print_span(self, data: dict[str, Any]) -> None:
        """Print all span fields to console.

        Special handling for taskRequest and taskResponse - highlighted in green
        to match evaluation output style.
        """
        self.console.print()
        for key, value in data.items():
            formatted = self._format_value(value)
            # Highlight taskRequest and taskResponse in green (like evaluation)
            if key in ("taskRequest", "taskResponse"):
                self.console.print(f" [bold green]{key}[/bold green]={formatted}")
            else:
                self.console.print(f" [cyan]{key}[/cyan]={formatted}")


class ConsoleReporter:
    """Handles progress reporting for benchmark runs.

    Encapsulates all console output logic, supporting verbose mode
    and trace mode for debugging.
    """

    def __init__(
        self,
        console: Console,
        config: HarnessConfig,
        *,
        verbose: bool = False,
        trace: bool = False,
        no_color: bool = False,
    ) -> None:
        """Initialize the reporter.

        Args:
            console: Rich console for output.
            config: Harness configuration (for looking up prompts).
            verbose: Enable verbose output with full content.
            trace: Enable timestamped trace output for debugging.
            no_color: Disable color output.
        """
        self.console = console
        self.config = config
        self.verbose = verbose
        self.trace = trace
        self.no_color = no_color
        self.span_printer = SpanPrinter(console, verbose=verbose)

        # Track LLM call count per task for [call=N] prefix
        self.llm_call_counts: dict[str, int] = {}

        # Track tool call count per task
        self.tool_call_counts: dict[str, int] = {}

        # Track connected servers per task (for inferring server in tool calls)
        self.connected_servers: dict[str, list[str]] = {}

        # Track current task for server connection events
        self.current_task: str | None = None

        # Current tool call LogSpan (captures timing and data)
        self.current_tool_span: LogSpan | None = None

        # Current harness LogSpan (captures timing and data)
        self.current_harness_span: LogSpan | None = None

        # Track code validation errors for summary
        self.validation_errors: list[dict[str, str]] = []

        # Track current scenario name
        self.current_scenario: str | None = None

    def _get_server_for_task(self, task: str | None) -> str:
        """Get the server name for a task.

        Args:
            task: Task name.

        Returns:
            Server name or "mcp" if unknown.
        """
        if task and task in self.connected_servers and self.connected_servers[task]:
            servers = self.connected_servers[task]
            return servers[0] if len(servers) == 1 else ",".join(servers)
        return "mcp"

    def on_event(
        self,
        event: str,
        *,
        scenario: str | None = None,
        task: str | None = None,
        result: TaskResult | None = None,
        server: str | None = None,
        server_status: str | None = None,  # noqa: ARG002 - part of event interface
        tool_count: int | None = None,
        error: str | None = None,
        tool_name: str | None = None,
        tool_args: dict[str, Any] | None = None,
        tool_result: str | None = None,
        llm_request: list[dict[str, Any]] | None = None,
        llm_response: str | None = None,
    ) -> None:
        """Handle a progress event from the runner.

        Args:
            event: Event type (scenario_start, task_start, task_complete, etc.).
            scenario: Scenario name (if applicable).
            task: Task name (if applicable).
            result: TaskResult (for task_complete events).
            server: Server name (for server_* events).
            server_status: Status message (for server_* events).
            tool_count: Number of tools available (for server_connected events).
            error: Error message (for server_failed events).
            tool_name: Name of tool being called (for tool_call/tool_response events).
            tool_args: Arguments passed to tool (for tool_call events).
            tool_result: Result from tool (for tool_response events).
            llm_request: Messages sent to LLM (for llm_request events).
            llm_response: Final LLM response text (for llm_response events).
        """
        if event == "scenario_start":
            self._on_scenario_start(scenario)
        elif event == "task_start":
            self._on_task_start(task)
        elif event == "server_connecting":
            self._on_server_connecting(server)
        elif event == "server_connected":
            self._on_server_connected(task, server, tool_count)
        elif event == "server_failed":
            self._on_server_failed(server, error)
        elif event == "tool_call":
            self._on_tool_call(task, tool_name, tool_args)
        elif event == "tool_response":
            self._on_tool_response(task, tool_name, tool_result)
        elif event == "llm_request":
            self._on_llm_request(task, llm_request)
        elif event == "llm_response":
            self._on_llm_response(task, llm_response)
        elif event == "task_complete":
            self._on_task_complete(task, result)
        elif event == "task_evaluated":
            self._on_task_evaluated(result)

    def _on_scenario_start(self, scenario: str | None) -> None:
        """Handle scenario_start event."""
        self.current_scenario = scenario
        self.console.print(f"\n[yellow]Scenario[/yellow]: {scenario}")

    def _on_task_start(self, task: str | None) -> None:
        """Handle task_start event."""
        if task:
            self.llm_call_counts[task] = 0
            self.tool_call_counts[task] = 0
            self.connected_servers[task] = []
            self.current_task = task

        # Visual separator between tasks
        self.console.print()
        self.console.print(Rule(style="dim"))
        self.console.print(f" [bold cyan]Task[/bold cyan]: {task}")

    def _on_server_connecting(self, _server: str | None) -> None:
        """Handle server_connecting event."""
        # Server connection logged but not displayed (too noisy)

    def _on_server_connected(
        self,
        task: str | None,
        server: str | None,
        _tool_count: int | None,
    ) -> None:
        """Handle server_connected event."""
        # Track connected server for this task
        task_key = task or self.current_task
        if task_key and server:
            if task_key not in self.connected_servers:
                self.connected_servers[task_key] = []
            self.connected_servers[task_key].append(server)

        # Server connection logged but not displayed (too noisy)

    def _on_server_failed(self, server: str | None, error: str | None) -> None:
        """Handle server_failed event."""
        self.console.print(
            f" [yellow]mcpFailed[/yellow]: [red]{server}[/red], error: {error}"
        )

    def _on_tool_call(
        self,
        task: str | None,
        tool_name: str | None,
        tool_args: dict[str, Any] | None,
    ) -> None:
        """Handle tool_call event - start a new tool span."""
        # Increment tool call counter
        if task and task in self.tool_call_counts:
            self.tool_call_counts[task] += 1

        server = self._get_server_for_task(task)
        call_num = self.tool_call_counts.get(task, 1) if task else 1

        # Format request as tool_name(args) - this is what the LLM sent to MCP
        request_llm = ""
        if tool_name and tool_args:
            if tool_name == "run" and "command" in tool_args:
                request_llm = str(tool_args["command"])
            else:
                request_llm = (
                    f"{tool_name}({json.dumps(tool_args, separators=(',', ':'))})"
                )

        # Create LogSpan - it captures timing automatically
        self.current_tool_span = LogSpan(
            span="bench.tool_call",
            task=task or "",
            call=call_num,
            server=server,
            tool=tool_name or "",
            requestLLM=request_llm,
        )

    def _on_tool_response(
        self,
        task: str | None,
        tool_name: str | None,  # noqa: ARG002
        tool_result: str | None,
    ) -> None:
        """Handle tool_response event - complete and emit tool span."""
        if not self.current_tool_span:
            return

        # Extract result for cleaner output
        response_mcp = _extract_result(tool_result or "")

        # Add response to span
        self.current_tool_span.add("responseMCP", response_mcp)

        # Check for error patterns and save for summary
        for pattern in TOOL_RESPONSE_ERROR_PATTERNS:
            if re.search(pattern, response_mcp):
                # Get requestLLM from the span (dict-style access)
                try:
                    request_llm = self.current_tool_span["requestLLM"]
                except KeyError:
                    request_llm = ""
                self.validation_errors.append(
                    {
                        "scenario": self.current_scenario or "",
                        "task": task or "",
                        "requestLLM": request_llm,
                        "responseMCP": response_mcp,
                    }
                )
                break

        # Get data from span for console output (includes duration)
        span_data = self.current_tool_span.to_dict()

        # Complete the LogSpan (logs to file)
        self.current_tool_span.__exit__(None, None, None)

        # Print to console
        self.span_printer.print_span(span_data)

        self.current_tool_span = None

    def _on_llm_request(
        self, task: str | None, llm_request: list[dict[str, Any]] | None
    ) -> None:
        """Handle llm_request event - start harness span on first call."""
        if not llm_request:
            return

        # Increment LLM call counter
        if task:
            if task not in self.llm_call_counts:
                self.llm_call_counts[task] = 0
            self.llm_call_counts[task] += 1

        # Only create harness span on first LLM call
        call_count = self.llm_call_counts.get(task, 0) if task else 0
        if call_count == 1:
            system_prompt = ""
            user_request = ""
            for msg in llm_request:
                role = msg.get("role", "")
                content = msg.get("content", "")
                if role == "system" and content:
                    system_prompt = content
                elif role == "user" and content:
                    user_request = content

            # Create LogSpan - it captures timing automatically
            self.current_harness_span = LogSpan(
                span="bench.llm_call",
                task=task or "",
                systemPrompt=system_prompt,
                taskRequest=user_request,
            )

    def _on_llm_response(self, task: str | None, llm_response: str | None) -> None:  # noqa: ARG002
        """Handle llm_response event - capture final response for harness span."""
        # Update harness span with final response
        if self.current_harness_span and llm_response:
            self.current_harness_span.add("taskResponse", llm_response)

    def _on_task_complete(self, task: str | None, result: TaskResult | None) -> None:
        """Handle task_complete event - emit harness span."""
        if not result:
            return

        # Complete and emit harness span
        if self.current_harness_span:
            # Add metrics and status to the span
            self.current_harness_span.add(
                tokensIn=result.input_tokens,
                tokensOut=result.output_tokens,
                llmCalls=result.llm_calls,
                toolCalls=result.tool_calls,
                cost=round(result.cost_usd, 6),
                taskStatus="error" if result.error else "complete",
                toolsUsed=result.tools_used or [],
            )
            if result.error:
                self.current_harness_span.add(error=result.error)

            # Get data from span for console output (includes duration)
            span_data = self.current_harness_span.to_dict()

            # Complete the LogSpan (logs to file)
            self.current_harness_span.__exit__(None, None, None)

            # Print to console
            self.span_printer.print_span(span_data)

            self.current_harness_span = None

        # Clean up task state
        if task:
            self.connected_servers.pop(task, None)

    def _on_task_evaluated(self, result: TaskResult | None) -> None:
        """Handle task_evaluated event - display evaluation result."""
        if not result or not result.evaluation:
            return

        eval_result = result.evaluation
        self.console.print()

        if eval_result.eval_type == "pass_fail":
            # Pass/fail evaluation - show PASS or FAIL
            if eval_result.passed:
                status_style = "bold green"
                status = "PASS"
            else:
                status_style = "bold red"
                status = "FAIL"

            self.console.print(
                f" [{status_style}]evaluation[/{status_style}]: {status}"
            )

            # In verbose mode, show expected vs actual
            if self.verbose:
                if eval_result.expected:
                    self.console.print(
                        f" [dim]expected[/dim]: {eval_result.expected}"
                    )
                if eval_result.actual:
                    self.console.print(f" [dim]actual[/dim]: {eval_result.actual}")
                if eval_result.reason:
                    self.console.print(f" [dim]reason[/dim]: {eval_result.reason}")
            elif not eval_result.passed:
                # For failures, always show expected vs actual for debugging
                if eval_result.expected:
                    self.console.print(
                        f" [dim]expected[/dim]: {eval_result.expected}"
                    )
                if eval_result.actual:
                    self.console.print(f" [dim]actual[/dim]: {eval_result.actual}")
                if eval_result.reason:
                    self.console.print(f" [dim]reason[/dim]: {eval_result.reason}")
        else:
            # Scored evaluation - show numeric score
            score = eval_result.score
            if score >= 80:
                score_style = "bold green"
            elif score >= 50:
                score_style = "bold yellow"
            else:
                score_style = "bold red"

            self.console.print(
                f" [{score_style}]evaluation[/{score_style}]: "
                f"score={score}/100, reason={eval_result.reason}"
            )

    def print_results_header(self) -> None:
        """Print the BENCHMARK RESULTS header with a double-line separator."""
        self.console.print()
        self.console.print(Rule("BENCHMARK RESULTS", style="bold cyan", characters="═"))
        self.console.print()

    def _format_eval_result(self, task_result: Any) -> str:
        """Format evaluation result for table display.

        Returns PASS/FAIL for pass_fail evaluations, numeric score for scored.
        """
        if not task_result.evaluation:
            return "-"

        eval_result = task_result.evaluation
        if eval_result.eval_type == "pass_fail":
            return "PASS" if eval_result.passed else "FAIL"
        else:
            return str(eval_result.score)

    def _style_eval_result(self, task_result: Any) -> str:
        """Get style for evaluation result based on pass/fail or score."""
        if not task_result.evaluation:
            return ""

        eval_result = task_result.evaluation
        if eval_result.eval_type == "pass_fail":
            return "bold green" if eval_result.passed else "bold red"
        else:
            score = eval_result.score
            if score >= 80:
                return "bold green"
            elif score >= 50:
                return "bold yellow"
            else:
                return "bold red"

    def print_results_table(
        self, scenario_result: Any, *, show_header: bool = False
    ) -> None:
        """Print a results table for a scenario.

        Args:
            scenario_result: ScenarioResult with tasks to display.
            show_header: If True, print the BENCHMARK RESULTS header first.
        """
        if show_header:
            self.print_results_header()

        self.console.print(f"[yellow]Scenario[/yellow]: {scenario_result.name}")

        # Check if any task has per-call metrics (for context columns)
        has_call_metrics = any(
            task.llm_call_metrics for task in scenario_result.tasks
        )

        # Create comparison table with ROUNDED box style
        table = Table(show_header=True, header_style="bold", box=box.ROUNDED)
        table.add_column("Task", min_width=16, no_wrap=True)
        table.add_column("in", justify="right", no_wrap=True)
        table.add_column("out", justify="right", no_wrap=True)
        table.add_column("tools", justify="right", no_wrap=True)
        table.add_column("time", justify="right", no_wrap=True)
        table.add_column("cost", justify="right", no_wrap=True)
        table.add_column("result", justify="right", no_wrap=True)

        for task_result in scenario_result.tasks:
            eval_display = self._format_eval_result(task_result)
            eval_style = self._style_eval_result(task_result)
            cost_cents = task_result.cost_usd * 100

            # Apply style to evaluation result
            if eval_style:
                eval_display = f"[{eval_style}]{eval_display}[/{eval_style}]"

            row = [
                task_result.name,
                str(task_result.input_tokens),
                str(task_result.output_tokens),
                str(task_result.tool_calls),
            ]

            row.extend([
                f"{task_result.duration_seconds:.0f}s",
                f"{cost_cents:.2f}¢",
                eval_display,
            ])

            table.add_row(*row)

        self.console.print(table)

        # Show per-call breakdown in verbose mode
        if self.verbose and has_call_metrics:
            self.console.print("\n [dim]Per-call breakdown:[/dim]")
            for task_result in scenario_result.tasks:
                if task_result.llm_call_metrics:
                    self.console.print(f" [cyan]{task_result.name}[/cyan]:")
                    for m in task_result.llm_call_metrics:
                        self.console.print(
                            f" call{m.call_number}: "
                            f"in={m.input_tokens}, out={m.output_tokens}, "
                            f"tools={m.tool_calls_made}, "
                            f"cumulative={m.cumulative_input}, "
                            f"latency={m.latency_ms}ms"
                        )

        # Show totals with camelCase labels
        totals = scenario_result.calculate_totals()
        cost_cents = totals["total_cost_usd"] * 100

        # Build totals line
        totals_parts = [
            f"tokensIn={totals['total_input_tokens']}",
            f"tokensOut={totals['total_output_tokens']}",
            f"llmCalls={totals['total_llm_calls']}",
            f"toolCalls={totals['total_tool_calls']}",
            f"cost={cost_cents:.2f}¢",
        ]

        # Add evaluation summary
        if "pass_count" in totals or "fail_count" in totals:
            pass_count = totals.get("pass_count", 0)
            fail_count = totals.get("fail_count", 0)
            if fail_count == 0:
                totals_parts.append(f"[bold green]{pass_count} passed[/bold green]")
            else:
                totals_parts.append(
                    f"[bold green]{pass_count} passed[/bold green], "
                    f"[bold red]{fail_count} failed[/bold red]"
                )

        if "avg_score" in totals:
            avg = totals["avg_score"]
            if avg >= 80:
                style = "bold green"
            elif avg >= 50:
                style = "bold yellow"
            else:
                style = "bold red"
            totals_parts.append(f"[{style}]avgScore={avg}[/{style}]")

        self.console.print(f"\n totals: {', '.join(totals_parts)}")

    def print_validation_errors(self) -> None:
        """Print summary of validation errors detected during the run."""
        if not self.validation_errors:
            return

        self.console.print()
        self.console.print(
            Rule("VALIDATION ERRORS", style="bold yellow", characters="─")
        )
        self.console.print()
        self.console.print(
            f"[yellow]{len(self.validation_errors)} validation error(s) detected "
            "(LLM retried after these):[/yellow]"
        )

        for error in self.validation_errors:
            self.console.print()
            self.console.print(f" [cyan]Scenario[/cyan]: {error['scenario']}")
            self.console.print(f" [cyan]Task[/cyan]: {error['task']}")
            self.console.print(f" [cyan]requestLLM[/cyan]= {error['requestLLM']}")
            self.console.print(f" [cyan]responseMCP[/cyan]= {error['responseMCP']}")