kodo-agent 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmark/__init__.py +0 -0
- benchmark/__main__.py +426 -0
- benchmark/_util.py +188 -0
- benchmark/curate_subset.py +125 -0
- benchmark/evaluate.py +1198 -0
- benchmark/evaluate_pending.py +256 -0
- benchmark/online/__init__.py +0 -0
- benchmark/online/cleanup_dummy_results.py +200 -0
- benchmark/online/client.py +310 -0
- benchmark/online/config.py +107 -0
- benchmark/online/db.py +1096 -0
- benchmark/online/distribute.py +65 -0
- benchmark/online/migrate_to_seeds.py +177 -0
- benchmark/online/mirror.py +203 -0
- benchmark/online/publish.py +399 -0
- benchmark/online/rename_arm.py +130 -0
- benchmark/online/server.py +682 -0
- benchmark/online/static/index.html +1183 -0
- benchmark/online/static/methodology.md +70 -0
- benchmark/online/static/progress.html +365 -0
- benchmark/online/static/register.html +197 -0
- benchmark/online/static/scheduling.html +450 -0
- benchmark/online/upload_history.py +244 -0
- benchmark/online/upload_tracker.py +114 -0
- benchmark/online/validation.py +139 -0
- benchmark/report.py +247 -0
- benchmark/runner.py +869 -0
- benchmark/tasks.py +97 -0
- kodo/__init__.py +119 -0
- kodo/__main__.py +5 -0
- kodo/advisory.py +127 -0
- kodo/agent.py +309 -0
- kodo/cli/__init__.py +23 -0
- kodo/cli/_improve.py +405 -0
- kodo/cli/_intake.py +642 -0
- kodo/cli/_interactive.py +191 -0
- kodo/cli/_launch.py +860 -0
- kodo/cli/_main.py +948 -0
- kodo/cli/_params.py +438 -0
- kodo/cli/_shared.py +86 -0
- kodo/cli/_subcommands.py +1168 -0
- kodo/cli/_teams_delete_pick.py +244 -0
- kodo/cli/_test.py +429 -0
- kodo/cli/_ui.py +116 -0
- kodo/coach.py +453 -0
- kodo/dashboard/__init__.py +18 -0
- kodo/dashboard/__main__.py +4 -0
- kodo/dashboard/dashboard.css +310 -0
- kodo/dashboard/dashboard.html +132 -0
- kodo/dashboard/dashboard.js +921 -0
- kodo/dashboard/server.py +543 -0
- kodo/debug.py +332 -0
- kodo/defaults/team-full.json +48 -0
- kodo/defaults/team-quick.json +26 -0
- kodo/env.py +9 -0
- kodo/factory.py +819 -0
- kodo/formatting.py +18 -0
- kodo/knowledge/__init__.py +6 -0
- kodo/knowledge/cli.py +101 -0
- kodo/knowledge/convergence.py +98 -0
- kodo/knowledge/models.py +221 -0
- kodo/knowledge/orchestrator.py +414 -0
- kodo/knowledge/prompts.py +226 -0
- kodo/knowledge/sessions.py +213 -0
- kodo/knowledge/team_designer.py +91 -0
- kodo/knowledge/tools.py +240 -0
- kodo/log.py +778 -0
- kodo/models.py +698 -0
- kodo/orchestrators/__init__.py +1 -0
- kodo/orchestrators/advisor.py +334 -0
- kodo/orchestrators/agent_tools.py +138 -0
- kodo/orchestrators/api.py +512 -0
- kodo/orchestrators/base.py +963 -0
- kodo/orchestrators/claude_code.py +220 -0
- kodo/orchestrators/cli_base.py +163 -0
- kodo/orchestrators/codex_cli.py +118 -0
- kodo/orchestrators/cursor_cli.py +157 -0
- kodo/orchestrators/cycle_utils.py +67 -0
- kodo/orchestrators/gemini_cli.py +154 -0
- kodo/orchestrators/git_ops.py +804 -0
- kodo/orchestrators/kimi_code.py +187 -0
- kodo/orchestrators/mcp_server.py +224 -0
- kodo/orchestrators/parallel.py +346 -0
- kodo/orchestrators/resume.py +40 -0
- kodo/orchestrators/run_status.py +93 -0
- kodo/orchestrators/stage_planning.py +113 -0
- kodo/orchestrators/tools.py +356 -0
- kodo/orchestrators/types.py +153 -0
- kodo/orchestrators/verification.py +383 -0
- kodo/prompts/improve.py +189 -0
- kodo/prompts/intake.py +58 -0
- kodo/prompts/other.py +8 -0
- kodo/prompts/roles.py +168 -0
- kodo/prompts/test.py +224 -0
- kodo/sessions/__init__.py +1 -0
- kodo/sessions/base.py +451 -0
- kodo/sessions/claude.py +535 -0
- kodo/sessions/codex.py +232 -0
- kodo/sessions/cursor.py +185 -0
- kodo/sessions/gemini_cli.py +197 -0
- kodo/sessions/kimi.py +376 -0
- kodo/sessions/kiro.py +151 -0
- kodo/sessions/opencode.py +192 -0
- kodo/summarizer.py +186 -0
- kodo/team_config.py +280 -0
- kodo/trace_upload.py +283 -0
- kodo/user_config.py +41 -0
- kodo/utils.py +64 -0
- kodo/viewer.html +1119 -0
- kodo/viewer.py +220 -0
- kodo_agent-0.5.0.dist-info/METADATA +419 -0
- kodo_agent-0.5.0.dist-info/RECORD +116 -0
- kodo_agent-0.5.0.dist-info/WHEEL +5 -0
- kodo_agent-0.5.0.dist-info/entry_points.txt +3 -0
- kodo_agent-0.5.0.dist-info/licenses/LICENSE +21 -0
- kodo_agent-0.5.0.dist-info/top_level.txt +2 -0
benchmark/__init__.py
ADDED
|
File without changes
|
benchmark/__main__.py
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
"""SWE-bench benchmark: kodo vs raw Claude Code / Cursor / Codex / Gemini.
|
|
2
|
+
|
|
3
|
+
By default, connects to the central server (KODO_BENCH_URL) to receive task
|
|
4
|
+
assignments and auto-detects available backends. Use --local for standalone runs.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
uv run python -m benchmark # server decides everything
|
|
8
|
+
uv run python -m benchmark --local --subset benchmark/subsets/pro-20.json
|
|
9
|
+
uv run python -m benchmark --local --arm kodo:solo --limit 2 --skip-eval
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
# Suppress noisy urllib3/chardet version mismatch warning from requests
|
|
15
|
+
# (triggered transitively via datasets/swebench imports).
|
|
16
|
+
import warnings
|
|
17
|
+
|
|
18
|
+
warnings.filterwarnings(
|
|
19
|
+
"ignore",
|
|
20
|
+
message=r"urllib3.*doesn't match a supported version",
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
from dotenv import find_dotenv, load_dotenv
|
|
24
|
+
|
|
25
|
+
load_dotenv(find_dotenv(usecwd=True))
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import os
|
|
29
|
+
import sys
|
|
30
|
+
from datetime import datetime, timezone
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
from benchmark._util import detect_backends, log, setup_logging
|
|
34
|
+
|
|
35
|
+
WORKSPACE = Path.home() / ".kodo" / "benchmark"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the benchmark command line."""
    parser = argparse.ArgumentParser(
        description="SWE-bench benchmark: kodo vs raw Claude Code",
    )
    add = parser.add_argument

    # -- Dataset and task selection --
    add(
        "--dataset",
        choices=["pro", "verified", "lite"],
        default="pro",
        help="SWE-bench variant (default: pro)",
    )
    add(
        "--subset",
        type=Path,
        default=None,
        help="Path to a subset JSON file (e.g. subsets/pro-20.json). "
        "Overrides --dataset and --instance-ids.",
    )
    add("--limit", type=int, default=None, help="Run first N tasks")
    add("--instance-ids", nargs="+", default=None, help="Specific instance IDs")
    add(
        "--repo",
        type=str,
        default=None,
        help="Filter to repo (e.g. 'ansible/ansible')",
    )
    add(
        "--language",
        type=str,
        default=None,
        help="Filter by language (e.g. 'python', 'go', 'js'). Pro only.",
    )
    add("--offset", type=int, default=0, help="Skip first N tasks")

    # -- Arm selection --
    add(
        "--arm",
        action="append",
        default=None,
        help="Arm to benchmark. Repeatable. 'claude', 'cursor', 'codex', "
        "'gemini' for raw CLI tools; 'kodo' for default team, "
        "'kodo:<team>' for a specific team (e.g. 'kodo:quick'). "
        "Default: claude + kodo.",
    )

    # -- Execution --
    add(
        "--timeout",
        type=int,
        default=7200,
        help="Per-task timeout for non-orchestrated arms in seconds (default: 7200 / 2h)",
    )
    add(
        "--timeout-kodo",
        type=int,
        default=43200,
        help="Per-task timeout for kodo arms in seconds (default: 43200 / 12h)",
    )
    add("--workspace", type=Path, default=WORKSPACE, help="Workspace directory")
    add("--run-id", type=str, default=None, help="Resume or reference a run ID")
    add("--parallel", type=int, default=1, help="Concurrent tasks (default: 1)")
    add(
        "--seed",
        type=int,
        default=0,
        help="Seed for deduplication. Same task+arm+seed won't re-run. "
        "Use different seeds to get multiple runs of the same tasks (default: 0).",
    )

    # -- Phase control --
    add(
        "--status",
        action="store_true",
        help="Show status of all benchmark runs and exit",
    )
    add("--skip-eval", action="store_true", help="Skip swebench evaluation")
    add(
        "--evaluate-only",
        action="store_true",
        help="Only evaluate existing predictions",
    )
    add(
        "--report-only",
        action="store_true",
        help="Only generate report from existing results",
    )
    add(
        "--publish",
        action="store_true",
        help="Publish results to GitHub Pages for the online viewer",
    )
    add(
        "--extract-patch",
        nargs=2,
        metavar=("INSTANCE_ID", "ARM"),
        help="Print a patch from published data",
    )
    add(
        "--upload-pending",
        action="store_true",
        help="Upload results not yet sent to the online server (requires KODO_BENCH_URL/TOKEN)",
    )
    add(
        "--evaluate-pending",
        action="store_true",
        help="Fetch unevaluated predictions from the online server and run Docker-based "
        "swebench evaluation locally. Uploads results back when done. "
        "(Requires KODO_BENCH_URL/TOKEN and Docker.)",
    )
    add(
        "--mirror-online",
        action="store_true",
        help="Mirror public online benchmark data into local JSON files for plotting",
    )
    add(
        "--mirror-out",
        type=Path,
        default=WORKSPACE / "mirror",
        help="Output directory for --mirror-online (default: ~/.kodo/benchmark/mirror)",
    )
    add(
        "--mirror-patches",
        action="store_true",
        help="With --mirror-online, also download patches.json",
    )

    # -- Mode --
    add(
        "--local",
        action="store_true",
        help="Run locally instead of connecting to the central server. "
        "Required when KODO_BENCH_URL/TOKEN are not set.",
    )
    add(
        "--backends",
        type=str,
        default=None,
        help="Override backend detection (e.g. 'claude,kodo:solo'). "
        "Default: auto-detect from PATH.",
    )
    return parser


def main() -> int:
    """Parse the command line and dispatch to the requested benchmark mode.

    The single-purpose modes (status, publish, extract-patch, upload-pending,
    evaluate-pending, mirror-online, report-only, evaluate-only) are checked
    first, in that order, and return immediately.  Otherwise the run is either
    handed to ``_run_distributed`` (when none of --local/--subset/--instance-ids
    is given and the online client reports it is configured) or executed
    locally: load tasks, run the arms, optionally evaluate, then report.

    Returns:
        The process exit code.
    """
    args = _build_parser().parse_args()
    setup_logging()

    workspace: Path = args.workspace
    workspace.mkdir(parents=True, exist_ok=True)

    # A UTC timestamp doubles as the run ID unless one was supplied.
    run_id = args.run_id or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    arms = args.arm or ["claude", "kodo"]

    # Each mode imports its implementation lazily so that cheap commands do
    # not pay for modules they never use.
    if args.status:
        from benchmark.report import print_status

        return print_status(workspace)

    if args.publish:
        from benchmark.online.publish import publish_results

        return publish_results(workspace, run_id=args.run_id)

    if args.extract_patch:
        from benchmark.online.publish import extract_patch

        patch_instance, patch_arm = args.extract_patch
        return extract_patch(patch_instance, patch_arm)

    if args.upload_pending:
        from benchmark.online.upload_tracker import flush_pending_uploads

        return flush_pending_uploads(workspace)

    if args.evaluate_pending:
        from benchmark.evaluate_pending import evaluate_pending

        return evaluate_pending(workspace, dataset_arg=args.dataset, arms=args.arm)

    if args.mirror_online:
        from benchmark.online.mirror import mirror_dataset

        mirror_dataset(
            args.dataset,
            out_dir=args.mirror_out,
            include_patches=args.mirror_patches,
        )
        return 0

    if args.report_only:
        from benchmark.report import generate_report

        return generate_report(workspace, run_id)

    if args.evaluate_only:
        from benchmark.evaluate import evaluate_predictions
        from benchmark.report import generate_report

        evaluate_predictions(workspace, run_id)
        return generate_report(workspace, run_id)

    # Distributed mode is decided before the heavy task-loading imports.
    # Report/evaluate-only runs were handled above: they work on local
    # artifacts and must not poll the server or load the task pool.
    if not (args.local or args.subset or args.instance_ids):
        from benchmark.online.client import is_configured, whoami

        if is_configured():
            identity = whoami()
            if identity:
                log.info("Authenticated as: %s", identity)
            return _run_distributed(args, workspace, run_id)

    from benchmark.evaluate import evaluate_predictions
    from benchmark.report import generate_report

    import json

    from benchmark.runner import BenchmarkInterrupted, run_benchmark
    from benchmark.tasks import DATASET_MAP, load_tasks

    # A subset file, when given, supplies the instance IDs and may also
    # name the dataset to use.
    instance_ids = args.instance_ids
    dataset = DATASET_MAP[args.dataset]
    if args.subset:
        subset_spec = json.loads(args.subset.read_text())
        instance_ids = subset_spec["instance_ids"]
        dataset = subset_spec.get("dataset", dataset)

    # Local mode (the distributed path returned earlier).
    tasks = load_tasks(
        dataset=dataset,
        limit=args.limit,
        instance_ids=instance_ids,
        repo_filter=args.repo,
        language=args.language,
        offset=args.offset,
    )
    if not tasks:
        log.error("No tasks matched the filters.")
        return 1

    log.info("Running %d tasks", len(tasks))

    try:
        run_benchmark(
            tasks=tasks,
            arms=arms,
            workspace=workspace,
            run_id=run_id,
            timeout=args.timeout,
            timeout_kodo=args.timeout_kodo,
            parallel=args.parallel,
            dataset=dataset,
            seed=args.seed,
        )
    except BenchmarkInterrupted as interrupted:
        return _print_interrupted(interrupted.completed_count)

    if not args.skip_eval:
        evaluate_predictions(workspace, run_id)

    return generate_report(workspace, run_id)
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def _run_distributed(args: argparse.Namespace, workspace: Path, run_id: str) -> int:
    """Poll the central server for task assignments and run them in batches.

    Args:
        args: Parsed CLI arguments.  Reads ``backends``, ``arm``, ``dataset``,
            ``limit``, ``timeout``, ``timeout_kodo``, ``parallel`` and ``seed``.
        workspace: Directory passed through to ``run_benchmark``.
        run_id: Identifier passed through to ``run_benchmark``.

    Returns:
        1 when fetching assignments fails before anything was completed;
        0 otherwise (no assignments left, a later fetch failure, or an
        interrupt).
    """
    from benchmark.online.client import fetch_assignments
    from benchmark.runner import BenchmarkInterrupted, run_benchmark
    from benchmark.tasks import DATASET_MAP, DATASET_PRO, load_tasks

    # Backends: explicit --backends > explicit --arm > auto-detect.
    # Names are stripped and empty entries dropped so that inputs such as
    # "claude, kodo:solo" or "claude," do not produce bogus backend names.
    # A --backends value with no usable names is treated as not given.
    backends = [
        name.strip() for name in (args.backends or "").split(",") if name.strip()
    ]
    if not backends:
        if args.arm:
            backends = args.arm
        else:
            backends = detect_backends()
            log.info("Detected agents: %s", ", ".join(backends))

    # Load only the requested dataset.
    all_datasets: dict[str, list[str]] = {}
    all_tasks: dict[str, object] = {}  # instance_id -> task
    ds_key = args.dataset
    ds_name = DATASET_MAP[ds_key]
    ds_tasks = load_tasks(dataset=ds_name)
    all_datasets[ds_key] = [t.instance_id for t in ds_tasks]
    for t in ds_tasks:
        all_tasks[t.instance_id] = t
    total_tasks = sum(len(v) for v in all_datasets.values())
    log.info(
        "Task pool: %d tasks (%s)",
        total_tasks,
        ", ".join(f"{k}: {len(v)}" for k, v in all_datasets.items()),
    )

    # --limit doubles as the batch size in distributed mode.
    batch_size = args.limit or 20
    total_completed = 0

    try:
        while True:
            try:
                assignments = fetch_assignments(
                    backends=backends,
                    datasets=all_datasets,
                    limit=batch_size,
                )
            except Exception as exc:
                log.error(
                    "Failed to get assignments from %s: %s",
                    os.environ.get("KODO_BENCH_URL", "(not set)"),
                    exc,
                )
                # Only report failure if nothing was accomplished at all.
                return 1 if total_completed == 0 else 0

            if not assignments:
                if total_completed == 0:
                    log.info("No tasks need evaluation — all covered!")
                else:
                    log.info("No more tasks. Completed %d total.", total_completed)
                return 0

            # dict.fromkeys de-duplicates while keeping the server's order.
            unique_ids = dict.fromkeys(a["instance_id"] for a in assignments)
            tasks = [all_tasks[iid] for iid in unique_ids if iid in all_tasks]
            arms = list({a["arm"] for a in assignments})
            ds_keys = {a.get("dataset", "pro") for a in assignments}
            dataset = DATASET_MAP.get(next(iter(ds_keys)), DATASET_PRO)
            log.info(
                "Received %d tasks x %d agents (%s) from %s",
                len(unique_ids),
                len(arms),
                ", ".join(arms),
                "/".join(ds_keys),
            )

            run_benchmark(
                tasks=tasks,
                arms=arms,
                workspace=workspace,
                run_id=run_id,
                timeout=args.timeout,
                timeout_kodo=args.timeout_kodo,
                parallel=args.parallel,
                dataset=dataset,
                seed=args.seed,
                assignments=assignments,
            )
            total_completed += len(assignments)
            log.info(
                "Batch done. %d completed so far, polling for more...", total_completed
            )
    except (KeyboardInterrupt, BenchmarkInterrupted) as exc:
        # BenchmarkInterrupted carries the count finished in the current batch.
        n = exc.completed_count if isinstance(exc, BenchmarkInterrupted) else 0
        return _print_interrupted(total_completed + n)
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def _print_interrupted(completed: int) -> int:
    """Log a short summary after an interrupt and return exit code 0.

    Args:
        completed: Number of tasks finished before the interrupt.
    """
    print()  # newline after ^C
    if completed == 1:
        log.info("Interrupted. 1 task completed and uploaded.")
    elif completed > 1:
        log.info(
            "Interrupted. %d tasks completed and uploaded. Thanks for contributing!",
            completed,
        )
    else:
        log.info("Interrupted. No tasks completed.")
    log.info("Claimed tasks will be reassigned automatically.")
    return 0
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
if __name__ == "__main__":
    # Ctrl+C anywhere in main() ends with the clean interrupt summary
    # rather than a traceback.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        exit_code = _print_interrupted(0)
    sys.exit(exit_code)
|
benchmark/_util.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Shared helpers for the benchmark package."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import platform
|
|
8
|
+
import re
|
|
9
|
+
import shutil
|
|
10
|
+
import subprocess
|
|
11
|
+
import time
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
log = logging.getLogger("benchmark")
|
|
15
|
+
|
|
16
|
+
# Arm name sanitization — used for filenames and Docker container names.
|
|
17
|
+
# Reversible: ":" → "--" (unlike the old ":" → "_" which was lossy).
|
|
18
|
+
_UNSAFE_RE = re.compile(r"[^a-zA-Z0-9_.-]")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def fmt_duration(seconds: int) -> str:
    """Render a number of seconds compactly, e.g. 7200 -> '2h', 300 -> '5m'.

    Values of an hour or more are shown in hours (one decimal unless the
    value is a whole number of hours), values of a minute or more in whole
    minutes (rounded down), and anything shorter in seconds.
    """
    if seconds >= 3600:
        hours = seconds / 3600
        decimals = 0 if hours == int(hours) else 1
        return f"{hours:.{decimals}f}h"
    if seconds >= 60:
        whole_minutes = seconds // 60
        return f"{whole_minutes}m"
    return f"{seconds}s"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def short_iid(instance_id: str) -> str:
    """Abbreviate an instance ID for display.

    'django__django-13195' becomes 'django/django#13195'.  IDs without a
    '__' separator are returned unchanged.  Underscores in the owner and
    repo parts are shown as dashes.
    """
    owner_raw, separator, remainder = instance_id.partition("__")
    if not separator:
        return instance_id

    owner = owner_raw.replace("_", "-")
    # Split on the last dash; an empty repo part means there was no dash
    # or it was the very first character.
    repo_raw, _, issue = remainder.rpartition("-")
    if not repo_raw:
        return f"{owner}/{remainder}"

    repo = repo_raw.replace("_", "-")
    # Truncate long hashes (e.g. Go SWE-bench commit SHAs)
    if len(issue) > 12:
        issue = issue[:8]
    return f"{owner}/{repo}#{issue}"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def docker_safe(name: str) -> str:
    """Return *name* with every character matched by ``_UNSAFE_RE`` replaced by '_'."""
    sanitized = re.sub(_UNSAFE_RE, "_", name)
    return sanitized
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def load_json(path: Path) -> dict:
    """Load a JSON file, returning {} on missing/corrupt files.

    Args:
        path: File to read.  It is decoded as UTF-8.

    Returns:
        The parsed JSON value, or ``{}`` when the file does not exist,
        cannot be read, is not valid UTF-8, or is not valid JSON.  A
        warning is logged for every case except a missing file.
    """
    if not path.exists():
        return {}
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, so
        # a file with undecodable bytes is treated as corrupt, not fatal.
        log.warning("Failed to parse %s", path)
        return {}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def load_jsonl(path: Path) -> list[dict]:
    """Read a JSONL file into a list, one parsed value per non-blank line.

    A missing file yields an empty list.  Lines that are not valid JSON are
    skipped with a warning.
    """
    records: list[dict] = []
    if path.exists():
        with open(path) as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if text:
                    try:
                        records.append(json.loads(text))
                    except json.JSONDecodeError:
                        log.warning("Skipping bad JSONL line in %s", path)
    return records
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def iter_jsonl(path: Path):
    """Lazily yield one parsed value per non-blank line of a JSONL file.

    Nothing is yielded when the file does not exist.  Lines that are not
    valid JSON are skipped with a warning.
    """
    if path.exists():
        with open(path) as handle:
            for raw_line in handle:
                text = raw_line.strip()
                if text:
                    try:
                        yield json.loads(text)
                    except json.JSONDecodeError:
                        log.warning("Skipping bad JSONL line in %s", path)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def setup_logging(verbose: bool = False) -> None:
    """Set up root logging for benchmark runs.

    Args:
        verbose: Use DEBUG level when true, INFO otherwise.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s %(levelname)-5s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# CLI tool name → arm name(s). kodo is always available (it's this project).
|
|
107
|
+
_BACKEND_CLI_MAP: list[tuple[str, list[str]]] = [
|
|
108
|
+
("claude", ["claude"]),
|
|
109
|
+
("cursor-agent", ["cursor"]),
|
|
110
|
+
("codex", ["codex"]),
|
|
111
|
+
("gemini", ["gemini"]),
|
|
112
|
+
]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def ensure_docker_running(timeout: int = 60) -> bool:
    """Make sure the Docker daemon is usable, starting it on macOS if needed.

    Args:
        timeout: Seconds to keep polling for readiness after a start attempt.

    Returns:
        True if Docker is available, False otherwise.  On any platform
        other than macOS no start is attempted; a hint is logged instead.
    """
    if _docker_is_ready():
        return True

    log.info("Docker daemon is not running. Attempting to start...")

    on_macos = platform.system() == "Darwin"
    if not on_macos:
        log.warning(
            "Docker is not running. Start it manually:\n sudo systemctl start docker"
        )
        return False

    started = _start_docker_macos()
    if not started:
        log.error("Could not start Docker. Please start it manually.")
        return False

    # Poll every two seconds until the daemon answers or the budget runs out.
    give_up_at = time.monotonic() + timeout
    while time.monotonic() < give_up_at:
        if _docker_is_ready():
            log.info("Docker is now running.")
            return True
        time.sleep(2)

    log.error("Docker did not become ready within %ds.", timeout)
    return False
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _docker_is_ready() -> bool:
    """Report whether ``docker info`` exits successfully within 10 seconds.

    A missing ``docker`` executable or a timeout counts as not ready.
    """
    try:
        probe = subprocess.run(["docker", "info"], capture_output=True, timeout=10)
    except (FileNotFoundError, subprocess.TimeoutExpired):
        return False
    return probe.returncode == 0
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _start_docker_macos() -> bool:
    """Try to start Docker on macOS via OrbStack or Docker Desktop.

    OrbStack is tried first when ``orbctl`` is on PATH; if that fails the
    function falls back to launching Docker Desktop with ``open -a Docker``.

    Returns:
        True if one of the launch commands completed successfully, False if
        neither did.  Launch failures (non-zero exit, timeout, or the
        command being unrunnable) are reported via the return value and
        never raised.
    """
    # OSError is included (it covers FileNotFoundError) so that a binary
    # that vanished or cannot be executed falls through instead of crashing.
    launch_errors = (
        subprocess.CalledProcessError,
        subprocess.TimeoutExpired,
        OSError,
    )

    if shutil.which("orbctl"):
        log.info("Starting Docker via OrbStack...")
        try:
            subprocess.run(["orbctl", "start"], check=True, timeout=30)
            return True
        except launch_errors:
            log.warning("OrbStack start failed, trying Docker Desktop...")

    try:
        # timeout=10 can raise TimeoutExpired, which must be handled here too.
        subprocess.run(["open", "-a", "Docker"], check=True, timeout=10)
    except launch_errors:
        return False
    log.info("Starting Docker Desktop (this may take 30-60s)...")
    return True
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def detect_backends() -> list[str]:
    """List the benchmark arms usable on this machine.

    ``kodo`` always comes first.  Each entry of ``_BACKEND_CLI_MAP`` then
    contributes its arm names when its CLI tool is found on PATH.
    """
    available: list[str] = ["kodo"]
    available += [
        arm
        for cli_name, arm_names in _BACKEND_CLI_MAP
        if shutil.which(cli_name)
        for arm in arm_names
    ]
    return available
|