onetool-mcp 1.0.0b1__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
bench/run.py
ADDED
@@ -0,0 +1,487 @@
+"""Run command for running agent benchmarks with MCP servers."""
+
+from __future__ import annotations
+
+import glob
+import os
+from pathlib import Path
+
+import questionary
+import typer
+import yaml
+from pydantic import BaseModel, Field
+from rich.console import Console
+
+from bench.cli import app
+from bench.harness.config import load_config
+from bench.harness.csv_writer import write_results_csv
+from bench.harness.runner import AgenticRunner
+from bench.reporter import ConsoleReporter
+from bench.utils import run_async
+from ot._tui import ask_select
+from ot.logging import LogSpan, configure_logging
+from ot.paths import get_effective_cwd, get_global_dir
+from ot.support import get_support_banner, get_version
+
+# Exit codes
+EXIT_SUCCESS = 0
+EXIT_RUNTIME_ERROR = 1
+EXIT_CONFIG_ERROR = 2
+EXIT_FILE_NOT_FOUND = 3
+
+
+def _print_startup_banner(console: Console) -> None:
+    """Print startup message."""
+    version = get_version()
+    console.print(f"[bold cyan]OneTool Benchmark[/bold cyan] [dim]v{version}[/dim]")
+    console.print(get_support_banner())
+    console.print()
+
+
+class BenchFavorite(BaseModel):
+    """A favorite benchmark entry."""
+
+    name: str = Field(description="Display name in picker")
+    path: str = Field(description="File path or directory")
+
+
+class BenchConfig(BaseModel):
+    """Configuration for bench CLI."""
+
+    favorites: list[BenchFavorite] = Field(
+        default_factory=list, description="Favorite benchmarks"
+    )
+
+
+def load_bench_config(config_path: Path | str | None = None) -> BenchConfig:
+    """Load bench configuration from YAML file.
+
+    Resolution order (when config_path is None):
+    1. BENCH_CONFIG env var
+    2. cwd/.onetool/config/bench.yaml
+    3. cwd/.onetool/bench.yaml
+    4. ~/.onetool/bench.yaml
+    5. Built-in defaults
+    """
+    if config_path is None:
+        # Check BENCH_CONFIG env var first
+        env_config = os.getenv("BENCH_CONFIG")
+        if env_config:
+            config_path = Path(env_config)
+        else:
+            cwd = get_effective_cwd()
+            # Try project config: cwd/.onetool/config/bench.yaml (preferred)
+            project_config = cwd / ".onetool" / "config" / "bench.yaml"
+            if project_config.exists():
+                config_path = project_config
+            else:
+                # Try legacy location: cwd/.onetool/bench.yaml
+                legacy_config = cwd / ".onetool" / "bench.yaml"
+                if legacy_config.exists():
+                    config_path = legacy_config
+                else:
+                    # Try global config: ~/.onetool/bench.yaml
+                    global_config = get_global_dir() / "bench.yaml"
+                    if global_config.exists():
+                        config_path = global_config
+                    else:
+                        # No config found, use defaults
+                        return BenchConfig()
+    else:
+        config_path = Path(config_path)
+
+    if not config_path.exists():
+        return BenchConfig()
+
+    with config_path.open() as f:
+        raw_data = yaml.safe_load(f) or {}
+
+    return BenchConfig.model_validate(raw_data)
+
+
+def get_yaml_description(file_path: Path) -> str | None:
+    """Extract description field from a YAML benchmark file."""
+    try:
+        with file_path.open() as f:
+            data = yaml.safe_load(f)
+        if isinstance(data, dict):
+            return data.get("description")
+    except Exception:
+        pass
+    return None
+
+
+def scan_yaml_files(directory: Path) -> list[Path]:
+    """Recursively scan directory for YAML files, excluding hidden directories."""
+    files = []
+    for path in directory.rglob("*"):
+        # Skip hidden directories
+        if any(part.startswith(".") for part in path.parts):
+            continue
+        if path.is_file() and path.suffix in (".yaml", ".yml"):
+            files.append(path)
+    return sorted(files)
+
+
+def run_tui_picker(console: Console) -> list[Path] | None:
+    """Run interactive TUI for selecting benchmark file(s).
+
+    Returns:
+        List of paths when a glob pattern matches multiple files.
+        Single-item list for directory browsing or single file favorites.
+        None if user cancels.
+    """
+    import asyncio
+
+    bench_config = load_bench_config()
+
+    if not bench_config.favorites:
+        console.print("[dim]No favorites configured[/dim]")
+        console.print("[dim]Add favorites to .onetool/config/bench.yaml[/dim]")
+        return None
+
+    async def pick_favorite() -> list[Path] | None:
+        favorites = bench_config.favorites
+
+        while True:
+            # Build choices using indices to avoid questionary value issues
+            choices = [
+                questionary.Choice(fav.name, value=str(i))
+                for i, fav in enumerate(favorites)
+            ]
+            choices.append(
+                questionary.Choice("Exit", value="__exit__", shortcut_key="e")
+            )
+
+            selected = await ask_select("Select benchmark:", choices)
+            if not selected or selected == "__exit__" or not selected.isdigit():
+                return None
+
+            fav = favorites[int(selected)]
+            fav_path_str = fav.path
+
+            # Check if path contains glob characters
+            has_glob = any(c in fav_path_str for c in "*?[")
+
+            if has_glob:
+                # Expand glob pattern - return ALL matching files
+                yaml_files = expand_glob_patterns([fav_path_str])
+                if not yaml_files:
+                    console.print(f"[dim]No files matched: {fav_path_str}[/dim]")
+                    continue  # Go back to favorites picker
+
+                # Return all matching files (don't prompt to pick one)
+                return yaml_files
+
+            fav_path = Path(fav_path_str)
+
+            # If it's a file, return it directly (as a list)
+            if fav_path.is_file():
+                return [fav_path]
+
+            # If it's a directory, scan for YAML files
+            if fav_path.is_dir():
+                yaml_files = scan_yaml_files(fav_path)
+                if not yaml_files:
+                    console.print(f"[dim]No YAML files found in {fav_path}[/dim]")
+                    continue  # Go back to favorites picker
+
+                # Build choices using indices
+                file_choices = []
+                for i, f in enumerate(yaml_files):
+                    rel_path = f.relative_to(fav_path)
+                    desc = get_yaml_description(f)
+                    label = f"{rel_path}" + (f" - {desc}" if desc else "")
+                    file_choices.append(questionary.Choice(label, value=str(i)))
+                file_choices.append(
+                    questionary.Choice("Back", value="__back__", shortcut_key="b")
+                )
+
+                file_selected = await ask_select("Select file:", file_choices)
+                if (
+                    not file_selected
+                    or file_selected == "__back__"
+                    or not file_selected.isdigit()
+                ):
+                    continue  # Go back to favorites picker
+                return [yaml_files[int(file_selected)]]
+
+            console.print(f"[red]Path not found: {fav_path}[/red]")
+            continue  # Go back to favorites picker
+
+    return asyncio.run(pick_favorite())
+
+
+def expand_glob_patterns(patterns: list[str]) -> list[Path]:
+    """Expand glob patterns to list of files, preserving order."""
+    files: list[Path] = []
+    seen: set[Path] = set()
+    for pattern in patterns:
+        # Try glob expansion first
+        expanded = glob.glob(pattern)  # noqa: PTH207 - glob.glob handles string patterns directly
+        if expanded:
+            for f in sorted(expanded):
+                path = Path(f)
+                if path.is_file() and path not in seen:
+                    files.append(path)
+                    seen.add(path)
+        else:
+            # No glob match, treat as literal path
+            path = Path(pattern)
+            if path not in seen:
+                files.append(path)
+                seen.add(path)
+    return files
+
+
+def run_single_benchmark(
+    config_file: Path,
+    console: Console,
+    scenario: str | None,
+    task: str | None,
+    tag: list[str] | None,
+    dry_run: bool,
+    verbose: bool,
+    trace: bool,
+    no_color: bool,
+) -> tuple[list, bool]:
+    """Run a single benchmark file.
+
+    Returns:
+        Tuple of (results, success) where:
+        - results: List of ScenarioResult objects
+        - success: True if completed without runtime errors or interrupts.
+          Test evaluation failures (PASS/FAIL) don't affect this.
+    """
+    with LogSpan(span="bench.config.load", path=str(config_file)) as span:
+        try:
+            config = load_config(config_file)
+            span.add(scenarios=len(config.scenarios), servers=len(config.servers))
+        except FileNotFoundError as e:
+            span.add(error="file_not_found")
+            console.print(f"[red]Error:[/red] {e}")
+            return [], False
+        except Exception as e:
+            span.add(error=str(e))
+            console.print(f"[red]Configuration error:[/red] {e}")
+            return [], False
+
+    console.print(f"Loaded config: {config_file}")
+    console.print(f"  Scenarios: {len(config.scenarios)}")
+    console.print(f"  Servers: {list(config.servers.keys())}")
+
+    reporter = ConsoleReporter(
+        console=console,
+        config=config,
+        verbose=verbose,
+        trace=trace,
+        no_color=no_color,
+    )
+
+    runner = AgenticRunner(
+        config,
+        dry_run=dry_run,
+        verbose=verbose,
+        on_progress=reporter.on_event,
+    )
+
+    interrupted = False
+    try:
+        results = run_async(
+            runner.run_scenario(scenario_name=scenario, task_name=task, tags=tag)
+        )
+    except KeyboardInterrupt:
+        console.print("\n[yellow]Interrupted by user[/yellow]")
+        results = runner.partial_results
+        if results:
+            task_count = sum(len(s.tasks) for s in results)
+            console.print(f"[dim]Showing {task_count} completed task(s)[/dim]")
+        interrupted = True
+    except Exception as e:
+        console.print(f"[red]Runtime error:[/red] {e}")
+        return [], False
+
+    # Output results for this file (even if interrupted)
+    if results:
+        reporter.print_results_header()
+        for scenario_result in results:
+            reporter.print_results_table(scenario_result)
+        reporter.print_validation_errors()
+
+    return results or [], not interrupted
+
+
+@app.command()
+def run(
+    config_files: list[str] = typer.Argument(
+        None,
+        help="Path(s) to YAML config file(s). Supports glob patterns (e.g., *.yaml).",
+    ),
+    scenario: str | None = typer.Option(
+        None,
+        "--scenario",
+        "-s",
+        help="Run only scenarios matching this pattern (supports wildcards).",
+    ),
+    task: str | None = typer.Option(
+        None,
+        "--task",
+        "-t",
+        help="Run only tasks matching this pattern (supports wildcards: direct*, *:sha256:*).",
+    ),
+    tag: list[str] | None = typer.Option(
+        None,
+        "--tag",
+        help="Run only tasks with the specified tag(s). Can be specified multiple times.",
+    ),
+    output: Path | None = typer.Option(
+        None,
+        "--output",
+        "-o",
+        help="Path to write results YAML file.",
+    ),
+    dry_run: bool = typer.Option(
+        False,
+        "--dry-run",
+        help="Validate config without making API calls.",
+    ),
+    verbose: bool = typer.Option(
+        False,
+        "--verbose",
+        "-v",
+        help="Enable verbose output with full content.",
+    ),
+    trace: bool = typer.Option(
+        False,
+        "--trace",
+        help="Show timestamped request/response cycle for debugging timing.",
+    ),
+    no_color: bool = typer.Option(
+        False,
+        "--no-color",
+        help="Disable colored output (for CI/CD compatibility).",
+    ),
+    tui: bool = typer.Option(
+        False,
+        "--tui",
+        help="Interactive TUI mode for selecting from favorites.",
+    ),
+    csv: bool = typer.Option(
+        False,
+        "--csv",
+        help="Write results to CSV file with per-call metrics breakdown.",
+    ),
+) -> None:
+    """Run tasks (direct MCP calls or agent benchmarks).
+
+    Task types:
+        type: direct - Direct MCP tool invocation (no LLM)
+        type: harness - LLM benchmark with MCP servers (default)
+
+    Examples:
+        bench run config.yaml
+        bench run examples/bench/*.yaml
+        bench run file1.yaml file2.yaml
+        bench run config.yaml --scenario "Tool Tests"
+        bench run config.yaml --task "direct*"
+        bench run config.yaml --tag focus
+        bench run config.yaml --verbose --trace
+        bench run config.yaml --dry-run
+        bench run config.yaml --output results.yaml
+        bench run --tui
+    """
+    # Initialize console with no_color option and no auto-highlighting
+    console = Console(no_color=no_color, force_terminal=not no_color, highlight=False)
+
+    # Initialize logging inside command to avoid module-level side effects
+    configure_logging(log_name="bench")
+
+    # Print startup banner
+    _print_startup_banner(console)
+
+    # Handle TUI mode
+    if tui:
+        selected_files = run_tui_picker(console)
+        if not selected_files:
+            raise typer.Exit(EXIT_SUCCESS)
+        files_to_run = selected_files
+    elif not config_files:
+        console.print(
+            "[red]Error:[/red] Missing config file. Use --tui or provide a path."
+        )
+        raise typer.Exit(EXIT_CONFIG_ERROR)
+    else:
+        # Expand glob patterns
+        files_to_run = expand_glob_patterns(config_files)
+        if not files_to_run:
+            console.print("[red]Error:[/red] No files matched the provided pattern(s).")
+            raise typer.Exit(EXIT_FILE_NOT_FOUND)
+
+    # Validate all files exist
+    missing = [f for f in files_to_run if not f.exists()]
+    if missing:
+        for f in missing:
+            console.print(f"[red]Error:[/red] File not found: {f}")
+        raise typer.Exit(EXIT_FILE_NOT_FOUND)
+
+    if dry_run:
+        console.print("[yellow]Dry run mode - no API calls will be made[/yellow]")
+
+    if len(files_to_run) > 1:
+        console.print(f"[cyan]Running {len(files_to_run)} benchmark files[/cyan]\n")
+
+    # Run each benchmark file
+    # Note: runtime_error tracks exceptions/interrupts, NOT test evaluation failures.
+    # Test failures (PASS/FAIL) don't affect exit code - only runtime errors do.
+    all_results = []
+    runtime_error = False
+
+    for i, config_file in enumerate(files_to_run):
+        if len(files_to_run) > 1:
+            console.print(
+                f"\n[bold cyan]═══ File {i + 1}/{len(files_to_run)}: {config_file} ═══[/bold cyan]\n"
+            )
+
+        results, success = run_single_benchmark(
+            config_file=config_file,
+            console=console,
+            scenario=scenario,
+            task=task,
+            tag=tag,
+            dry_run=dry_run,
+            verbose=verbose,
+            trace=trace,
+            no_color=no_color,
+        )
+        all_results.extend(results)
+        if not success:
+            runtime_error = True

+    if not all_results:
+        console.print("[yellow]No results to report[/yellow]")
+        raise typer.Exit(EXIT_SUCCESS if not runtime_error else EXIT_RUNTIME_ERROR)
+
+    # Write aggregated results to file if specified
+    if output:
+        try:
+            output_data = {"results": [r.to_dict() for r in all_results]}
+            with output.open("w") as f:
+                yaml.dump(output_data, f, default_flow_style=False, sort_keys=False)
+            console.print(f"\nResults written to: {output}")
+        except OSError as e:
+            console.print(f"[red]Error writing results:[/red] {e}")
+            raise typer.Exit(EXIT_RUNTIME_ERROR) from e
+
+    # Write CSV with per-call metrics if requested
+    if csv:
+        try:
+            csv_path = write_results_csv(all_results)
+            console.print(f"CSV results written to: {csv_path}")
+        except OSError as e:
+            console.print(f"[red]Error writing CSV:[/red] {e}")
+            raise typer.Exit(EXIT_RUNTIME_ERROR) from e
+
+    # Exit 1 only for runtime errors (exceptions, config errors, interrupts)
+    # Test evaluation failures (PASS/FAIL) exit 0 - they're not runtime errors
+    if runtime_error:
+        raise typer.Exit(EXIT_RUNTIME_ERROR)
bench/secrets.py
ADDED
@@ -0,0 +1,101 @@
+"""Secrets loading for bench.
+
+Loads bench secrets from bench-secrets.yaml, separate from onetool secrets.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import yaml
+
+from ot.logging import LogSpan
+from ot.paths import get_effective_cwd, get_global_dir
+
+# Cached bench secrets
+_bench_secrets: dict[str, str] | None = None
+
+
+def _find_bench_secrets_file() -> Path | None:
+    """Find bench-secrets.yaml file.
+
+    Resolution order:
+    1. .onetool/config/bench-secrets.yaml (project-level, preferred)
+    2. .onetool/bench-secrets.yaml (project-level, legacy)
+    3. ~/.onetool/bench-secrets.yaml (global)
+
+    Returns:
+        Path to bench-secrets.yaml if found, None otherwise
+    """
+    cwd = get_effective_cwd()
+
+    # Project-level (preferred location)
+    project_config_path = cwd / ".onetool" / "config" / "bench-secrets.yaml"
+    if project_config_path.exists():
+        return project_config_path
+
+    # Project-level (legacy location)
+    project_path = cwd / ".onetool" / "bench-secrets.yaml"
+    if project_path.exists():
+        return project_path
+
+    # Global
+    global_path = get_global_dir() / "bench-secrets.yaml"
+    if global_path.exists():
+        return global_path
+
+    return None
+
+
+def load_bench_secrets() -> dict[str, str]:
+    """Load bench secrets from bench-secrets.yaml.
+
+    Returns:
+        Dictionary of secret name -> value
+    """
+    global _bench_secrets
+
+    if _bench_secrets is not None:
+        return _bench_secrets
+
+    secrets_path = _find_bench_secrets_file()
+
+    with LogSpan(
+        span="bench.secrets.load",
+        path=str(secrets_path) if secrets_path else "not_found",
+    ) as span:
+        if secrets_path is None:
+            span.add(error="bench-secrets.yaml not found")
+            _bench_secrets = {}
+            return _bench_secrets
+
+        try:
+            with secrets_path.open() as f:
+                raw_data = yaml.safe_load(f)
+        except (yaml.YAMLError, OSError) as e:
+            span.add(error=str(e))
+            _bench_secrets = {}
+            return _bench_secrets
+
+        if raw_data is None:
+            span.add(count=0)
+            _bench_secrets = {}
+            return _bench_secrets
+
+        # Convert all values to strings
+        _bench_secrets = {k: str(v) for k, v in raw_data.items() if v is not None}
+        span.add(count=len(_bench_secrets))
+        return _bench_secrets
+
+
+def get_bench_secret(name: str) -> str:
+    """Get a bench secret by name.
+
+    Args:
+        name: Secret name (e.g., "OPENAI_API_KEY")
+
+    Returns:
+        Secret value, or empty string if not found
+    """
+    secrets = load_bench_secrets()
+    return secrets.get(name, "")
bench/utils.py
ADDED
@@ -0,0 +1,16 @@
+"""Shared utilities for bench CLI commands."""
+
+from __future__ import annotations
+
+import asyncio
+from typing import TYPE_CHECKING, Any, TypeVar
+
+if TYPE_CHECKING:
+    from collections.abc import Coroutine
+
+T = TypeVar("T")
+
+
+def run_async(coro: Coroutine[Any, Any, T]) -> T:
+    """Run an async coroutine synchronously."""
+    return asyncio.new_event_loop().run_until_complete(coro)
onetool/__init__.py
ADDED