kodo-agent 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. benchmark/__init__.py +0 -0
  2. benchmark/__main__.py +426 -0
  3. benchmark/_util.py +188 -0
  4. benchmark/curate_subset.py +125 -0
  5. benchmark/evaluate.py +1198 -0
  6. benchmark/evaluate_pending.py +256 -0
  7. benchmark/online/__init__.py +0 -0
  8. benchmark/online/cleanup_dummy_results.py +200 -0
  9. benchmark/online/client.py +310 -0
  10. benchmark/online/config.py +107 -0
  11. benchmark/online/db.py +1096 -0
  12. benchmark/online/distribute.py +65 -0
  13. benchmark/online/migrate_to_seeds.py +177 -0
  14. benchmark/online/mirror.py +203 -0
  15. benchmark/online/publish.py +399 -0
  16. benchmark/online/rename_arm.py +130 -0
  17. benchmark/online/server.py +682 -0
  18. benchmark/online/static/index.html +1183 -0
  19. benchmark/online/static/methodology.md +70 -0
  20. benchmark/online/static/progress.html +365 -0
  21. benchmark/online/static/register.html +197 -0
  22. benchmark/online/static/scheduling.html +450 -0
  23. benchmark/online/upload_history.py +244 -0
  24. benchmark/online/upload_tracker.py +114 -0
  25. benchmark/online/validation.py +139 -0
  26. benchmark/report.py +247 -0
  27. benchmark/runner.py +869 -0
  28. benchmark/tasks.py +97 -0
  29. kodo/__init__.py +119 -0
  30. kodo/__main__.py +5 -0
  31. kodo/advisory.py +127 -0
  32. kodo/agent.py +309 -0
  33. kodo/cli/__init__.py +23 -0
  34. kodo/cli/_improve.py +405 -0
  35. kodo/cli/_intake.py +642 -0
  36. kodo/cli/_interactive.py +191 -0
  37. kodo/cli/_launch.py +860 -0
  38. kodo/cli/_main.py +948 -0
  39. kodo/cli/_params.py +438 -0
  40. kodo/cli/_shared.py +86 -0
  41. kodo/cli/_subcommands.py +1168 -0
  42. kodo/cli/_teams_delete_pick.py +244 -0
  43. kodo/cli/_test.py +429 -0
  44. kodo/cli/_ui.py +116 -0
  45. kodo/coach.py +453 -0
  46. kodo/dashboard/__init__.py +18 -0
  47. kodo/dashboard/__main__.py +4 -0
  48. kodo/dashboard/dashboard.css +310 -0
  49. kodo/dashboard/dashboard.html +132 -0
  50. kodo/dashboard/dashboard.js +921 -0
  51. kodo/dashboard/server.py +543 -0
  52. kodo/debug.py +332 -0
  53. kodo/defaults/team-full.json +48 -0
  54. kodo/defaults/team-quick.json +26 -0
  55. kodo/env.py +9 -0
  56. kodo/factory.py +819 -0
  57. kodo/formatting.py +18 -0
  58. kodo/knowledge/__init__.py +6 -0
  59. kodo/knowledge/cli.py +101 -0
  60. kodo/knowledge/convergence.py +98 -0
  61. kodo/knowledge/models.py +221 -0
  62. kodo/knowledge/orchestrator.py +414 -0
  63. kodo/knowledge/prompts.py +226 -0
  64. kodo/knowledge/sessions.py +213 -0
  65. kodo/knowledge/team_designer.py +91 -0
  66. kodo/knowledge/tools.py +240 -0
  67. kodo/log.py +778 -0
  68. kodo/models.py +698 -0
  69. kodo/orchestrators/__init__.py +1 -0
  70. kodo/orchestrators/advisor.py +334 -0
  71. kodo/orchestrators/agent_tools.py +138 -0
  72. kodo/orchestrators/api.py +512 -0
  73. kodo/orchestrators/base.py +963 -0
  74. kodo/orchestrators/claude_code.py +220 -0
  75. kodo/orchestrators/cli_base.py +163 -0
  76. kodo/orchestrators/codex_cli.py +118 -0
  77. kodo/orchestrators/cursor_cli.py +157 -0
  78. kodo/orchestrators/cycle_utils.py +67 -0
  79. kodo/orchestrators/gemini_cli.py +154 -0
  80. kodo/orchestrators/git_ops.py +804 -0
  81. kodo/orchestrators/kimi_code.py +187 -0
  82. kodo/orchestrators/mcp_server.py +224 -0
  83. kodo/orchestrators/parallel.py +346 -0
  84. kodo/orchestrators/resume.py +40 -0
  85. kodo/orchestrators/run_status.py +93 -0
  86. kodo/orchestrators/stage_planning.py +113 -0
  87. kodo/orchestrators/tools.py +356 -0
  88. kodo/orchestrators/types.py +153 -0
  89. kodo/orchestrators/verification.py +383 -0
  90. kodo/prompts/improve.py +189 -0
  91. kodo/prompts/intake.py +58 -0
  92. kodo/prompts/other.py +8 -0
  93. kodo/prompts/roles.py +168 -0
  94. kodo/prompts/test.py +224 -0
  95. kodo/sessions/__init__.py +1 -0
  96. kodo/sessions/base.py +451 -0
  97. kodo/sessions/claude.py +535 -0
  98. kodo/sessions/codex.py +232 -0
  99. kodo/sessions/cursor.py +185 -0
  100. kodo/sessions/gemini_cli.py +197 -0
  101. kodo/sessions/kimi.py +376 -0
  102. kodo/sessions/kiro.py +151 -0
  103. kodo/sessions/opencode.py +192 -0
  104. kodo/summarizer.py +186 -0
  105. kodo/team_config.py +280 -0
  106. kodo/trace_upload.py +283 -0
  107. kodo/user_config.py +41 -0
  108. kodo/utils.py +64 -0
  109. kodo/viewer.html +1119 -0
  110. kodo/viewer.py +220 -0
  111. kodo_agent-0.5.0.dist-info/METADATA +419 -0
  112. kodo_agent-0.5.0.dist-info/RECORD +116 -0
  113. kodo_agent-0.5.0.dist-info/WHEEL +5 -0
  114. kodo_agent-0.5.0.dist-info/entry_points.txt +3 -0
  115. kodo_agent-0.5.0.dist-info/licenses/LICENSE +21 -0
  116. kodo_agent-0.5.0.dist-info/top_level.txt +2 -0
benchmark/__init__.py ADDED
File without changes
benchmark/__main__.py ADDED
@@ -0,0 +1,426 @@
1
+ """SWE-bench benchmark: kodo vs raw Claude Code / Cursor / Codex / Gemini.
2
+
3
+ By default, connects to the central server (KODO_BENCH_URL) to receive task
4
+ assignments and auto-detects available backends. Use --local for standalone runs.
5
+
6
+ Usage:
7
+ uv run python -m benchmark # server decides everything
8
+ uv run python -m benchmark --local --subset benchmark/subsets/pro-20.json
9
+ uv run python -m benchmark --local --arm kodo:solo --limit 2 --skip-eval
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ # Suppress noisy urllib3/chardet version mismatch warning from requests
15
+ # (triggered transitively via datasets/swebench imports).
16
+ import warnings
17
+
18
+ warnings.filterwarnings(
19
+ "ignore",
20
+ message=r"urllib3.*doesn't match a supported version",
21
+ )
22
+
23
+ from dotenv import find_dotenv, load_dotenv
24
+
25
+ load_dotenv(find_dotenv(usecwd=True))
26
+
27
+ import argparse
28
+ import os
29
+ import sys
30
+ from datetime import datetime, timezone
31
+ from pathlib import Path
32
+
33
+ from benchmark._util import detect_backends, log, setup_logging
34
+
35
+ WORKSPACE = Path.home() / ".kodo" / "benchmark"
36
+
37
+
38
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the benchmark command line."""
    parser = argparse.ArgumentParser(
        description="SWE-bench benchmark: kodo vs raw Claude Code",
    )

    # Dataset and task selection
    parser.add_argument(
        "--dataset",
        choices=["pro", "verified", "lite"],
        default="pro",
        help="SWE-bench variant (default: pro)",
    )
    parser.add_argument(
        "--subset",
        type=Path,
        default=None,
        help="Path to a subset JSON file (e.g. subsets/pro-20.json). "
        "Overrides --dataset and --instance-ids.",
    )
    parser.add_argument("--limit", type=int, default=None, help="Run first N tasks")
    parser.add_argument(
        "--instance-ids", nargs="+", default=None, help="Specific instance IDs"
    )
    parser.add_argument(
        "--repo",
        type=str,
        default=None,
        help="Filter to repo (e.g. 'ansible/ansible')",
    )
    parser.add_argument(
        "--language",
        type=str,
        default=None,
        help="Filter by language (e.g. 'python', 'go', 'js'). Pro only.",
    )
    parser.add_argument("--offset", type=int, default=0, help="Skip first N tasks")

    # Arm selection
    parser.add_argument(
        "--arm",
        action="append",
        default=None,
        help="Arm to benchmark. Repeatable. 'claude', 'cursor', 'codex', "
        "'gemini' for raw CLI tools; 'kodo' for default team, "
        "'kodo:<team>' for a specific team (e.g. 'kodo:quick'). "
        "Default: claude + kodo.",
    )

    # Execution
    parser.add_argument(
        "--timeout",
        type=int,
        default=7200,
        help="Per-task timeout for non-orchestrated arms in seconds (default: 7200 / 2h)",
    )
    parser.add_argument(
        "--timeout-kodo",
        type=int,
        default=43200,
        help="Per-task timeout for kodo arms in seconds (default: 43200 / 12h)",
    )
    parser.add_argument(
        "--workspace", type=Path, default=WORKSPACE, help="Workspace directory"
    )
    parser.add_argument(
        "--run-id", type=str, default=None, help="Resume or reference a run ID"
    )
    parser.add_argument(
        "--parallel", type=int, default=1, help="Concurrent tasks (default: 1)"
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Seed for deduplication. Same task+arm+seed won't re-run. "
        "Use different seeds to get multiple runs of the same tasks (default: 0).",
    )

    # Phase control
    parser.add_argument(
        "--status",
        action="store_true",
        help="Show status of all benchmark runs and exit",
    )
    parser.add_argument(
        "--skip-eval", action="store_true", help="Skip swebench evaluation"
    )
    parser.add_argument(
        "--evaluate-only",
        action="store_true",
        help="Only evaluate existing predictions",
    )
    parser.add_argument(
        "--report-only",
        action="store_true",
        help="Only generate report from existing results",
    )
    parser.add_argument(
        "--publish",
        action="store_true",
        help="Publish results to GitHub Pages for the online viewer",
    )
    parser.add_argument(
        "--extract-patch",
        nargs=2,
        metavar=("INSTANCE_ID", "ARM"),
        help="Print a patch from published data",
    )
    parser.add_argument(
        "--upload-pending",
        action="store_true",
        help="Upload results not yet sent to the online server (requires KODO_BENCH_URL/TOKEN)",
    )
    parser.add_argument(
        "--evaluate-pending",
        action="store_true",
        help="Fetch unevaluated predictions from the online server and run Docker-based "
        "swebench evaluation locally. Uploads results back when done. "
        "(Requires KODO_BENCH_URL/TOKEN and Docker.)",
    )
    parser.add_argument(
        "--mirror-online",
        action="store_true",
        help="Mirror public online benchmark data into local JSON files for plotting",
    )
    parser.add_argument(
        "--mirror-out",
        type=Path,
        default=WORKSPACE / "mirror",
        help="Output directory for --mirror-online (default: ~/.kodo/benchmark/mirror)",
    )
    parser.add_argument(
        "--mirror-patches",
        action="store_true",
        help="With --mirror-online, also download patches.json",
    )

    # Mode
    parser.add_argument(
        "--local",
        action="store_true",
        help="Run locally instead of connecting to the central server. "
        "Required when KODO_BENCH_URL/TOKEN are not set.",
    )
    parser.add_argument(
        "--backends",
        type=str,
        default=None,
        help="Override backend detection (e.g. 'claude,kodo:solo'). "
        "Default: auto-detect from PATH.",
    )
    return parser


def main() -> int:
    """CLI entrypoint for the SWE-bench benchmark harness.

    Parses the command line, creates the workspace directory, then dispatches
    to the first matching mode in this order: status, publish, extract-patch,
    upload-pending, evaluate-pending, mirror-online, report-only,
    evaluate-only, distributed run, local run.

    Returns:
        The exit code produced by the selected mode. A local run returns 1
        when the subset file cannot be loaded or no task matches the filters.
    """
    args = _build_parser().parse_args()
    setup_logging()
    workspace: Path = args.workspace
    workspace.mkdir(parents=True, exist_ok=True)

    # UTC timestamp as run ID
    run_id = args.run_id or datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    arms = args.arm if args.arm else ["claude", "kodo"]

    # Each utility mode imports its module lazily so unrelated modes do not
    # pay for heavy imports.
    if args.status:
        from benchmark.report import print_status

        return print_status(workspace)

    if args.publish:
        from benchmark.online.publish import publish_results

        # Deliberately passes the raw --run-id (possibly None), not the
        # generated timestamp.
        return publish_results(workspace, run_id=args.run_id)

    if args.extract_patch:
        from benchmark.online.publish import extract_patch

        return extract_patch(args.extract_patch[0], args.extract_patch[1])

    if args.upload_pending:
        from benchmark.online.upload_tracker import flush_pending_uploads

        return flush_pending_uploads(workspace)

    if args.evaluate_pending:
        from benchmark.evaluate_pending import evaluate_pending

        return evaluate_pending(workspace, dataset_arg=args.dataset, arms=args.arm)

    if args.mirror_online:
        from benchmark.online.mirror import mirror_dataset

        mirror_dataset(
            args.dataset,
            out_dir=args.mirror_out,
            include_patches=args.mirror_patches,
        )
        return 0

    if args.report_only:
        from benchmark.report import generate_report

        return generate_report(workspace, run_id)

    if args.evaluate_only:
        from benchmark.evaluate import evaluate_predictions
        from benchmark.report import generate_report

        evaluate_predictions(workspace, run_id)
        return generate_report(workspace, run_id)

    # Distribute mode: check before heavy task-loading imports.
    # Report/evaluate-only runs operate on local artifacts and should not poll
    # the server or load the task pool first.
    local_mode = args.local or args.subset or args.instance_ids
    if not local_mode:
        from benchmark.online.client import is_configured, whoami

        if is_configured():
            identity = whoami()
            if identity:
                log.info("Authenticated as: %s", identity)
            return _run_distributed(args, workspace, run_id)

    from benchmark.evaluate import evaluate_predictions
    from benchmark.report import generate_report

    # Run agents
    import json

    from benchmark.runner import BenchmarkInterrupted, run_benchmark
    from benchmark.tasks import DATASET_MAP, load_tasks

    # Resolve dataset and instance_ids from --subset if provided
    instance_ids = args.instance_ids
    dataset = DATASET_MAP[args.dataset]
    if args.subset:
        # A missing, unreadable or malformed subset file is a user error:
        # report it cleanly instead of surfacing a traceback.
        try:
            subset_data = json.loads(args.subset.read_text())
            instance_ids = subset_data["instance_ids"]
            dataset = subset_data.get("dataset", dataset)
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
            log.error("Could not load subset file %s: %s", args.subset, exc)
            return 1

    # Local mode (distribute path handled earlier)
    tasks = load_tasks(
        dataset=dataset,
        limit=args.limit,
        instance_ids=instance_ids,
        repo_filter=args.repo,
        language=args.language,
        offset=args.offset,
    )

    if not tasks:
        log.error("No tasks matched the filters.")
        return 1

    log.info("Running %d tasks", len(tasks))

    try:
        run_benchmark(
            tasks=tasks,
            arms=arms,
            workspace=workspace,
            run_id=run_id,
            timeout=args.timeout,
            timeout_kodo=args.timeout_kodo,
            parallel=args.parallel,
            dataset=dataset,
            seed=args.seed,
        )
    except BenchmarkInterrupted as exc:
        return _print_interrupted(exc.completed_count)

    if not args.skip_eval:
        evaluate_predictions(workspace, run_id)

    return generate_report(workspace, run_id)
312
+
313
+
314
def _run_distributed(args: argparse.Namespace, workspace: Path, run_id: str) -> int:
    """Poll central server for task assignments and run them in batches.

    Args:
        args: Parsed command-line arguments. Reads ``backends``, ``arm``,
            ``dataset``, ``limit``, ``timeout``, ``timeout_kodo``,
            ``parallel`` and ``seed``.
        workspace: Workspace directory handed to ``run_benchmark``.
        run_id: Run identifier handed to ``run_benchmark``.

    Returns:
        0 when the server has no more assignments or the run is interrupted;
        1 when fetching assignments fails before anything was completed.
    """
    from benchmark.online.client import fetch_assignments
    from benchmark.runner import BenchmarkInterrupted, run_benchmark
    from benchmark.tasks import DATASET_MAP, DATASET_PRO, load_tasks

    # Backends: explicit --backends > explicit --arm > auto-detect.
    # Strip whitespace and drop empty entries so "claude, kodo:solo" or a
    # trailing comma does not produce bogus backend names.
    backends = [
        name.strip() for name in (args.backends or "").split(",") if name.strip()
    ]
    if not backends:
        if args.arm:
            backends = args.arm
        else:
            backends = detect_backends()
            log.info("Detected agents: %s", ", ".join(backends))

    # Load only the requested dataset
    ds_key = args.dataset
    ds_tasks = load_tasks(dataset=DATASET_MAP[ds_key])
    all_datasets: dict[str, list[str]] = {
        ds_key: [t.instance_id for t in ds_tasks],
    }
    all_tasks = {t.instance_id: t for t in ds_tasks}  # instance_id -> task
    total_tasks = sum(len(v) for v in all_datasets.values())
    log.info(
        "Task pool: %d tasks (%s)",
        total_tasks,
        ", ".join(f"{k}: {len(v)}" for k, v in all_datasets.items()),
    )

    batch_size = args.limit or 20
    total_completed = 0

    try:
        while True:
            try:
                assignments = fetch_assignments(
                    backends=backends,
                    datasets=all_datasets,
                    limit=batch_size,
                )
            except Exception as exc:
                log.error(
                    "Failed to get assignments from %s: %s",
                    os.environ.get("KODO_BENCH_URL", "(not set)"),
                    exc,
                )
                return 1 if total_completed == 0 else 0

            if not assignments:
                if total_completed == 0:
                    log.info("No tasks need evaluation — all covered!")
                else:
                    log.info("No more tasks. Completed %d total.", total_completed)
                return 0

            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # arm order and the chosen dataset are deterministic for a batch.
            unique_ids = dict.fromkeys(a["instance_id"] for a in assignments)
            tasks = [all_tasks[iid] for iid in unique_ids if iid in all_tasks]
            arms = list(dict.fromkeys(a["arm"] for a in assignments))
            ds_keys = list(
                dict.fromkeys(a.get("dataset", "pro") for a in assignments)
            )
            dataset = DATASET_MAP.get(ds_keys[0], DATASET_PRO)
            log.info(
                "Received %d tasks x %d agents (%s) from %s",
                len(unique_ids),
                len(arms),
                ", ".join(arms),
                "/".join(ds_keys),
            )

            run_benchmark(
                tasks=tasks,
                arms=arms,
                workspace=workspace,
                run_id=run_id,
                timeout=args.timeout,
                timeout_kodo=args.timeout_kodo,
                parallel=args.parallel,
                dataset=dataset,
                seed=args.seed,
                assignments=assignments,
            )
            total_completed += len(assignments)
            log.info(
                "Batch done. %d completed so far, polling for more...", total_completed
            )
    except (KeyboardInterrupt, BenchmarkInterrupted) as exc:
        # Only BenchmarkInterrupted carries a count for the interrupted batch.
        n = exc.completed_count if isinstance(exc, BenchmarkInterrupted) else 0
        return _print_interrupted(total_completed + n)
404
+
405
+
406
def _print_interrupted(completed: int) -> int:
    """Log a short summary after an interrupt and return exit code 0.

    Args:
        completed: Number of tasks reported as completed before the interrupt.
    """
    print()  # newline after ^C
    if completed == 1:
        log.info("Interrupted. 1 task completed and uploaded.")
    elif completed > 1:
        log.info(
            "Interrupted. %d tasks completed and uploaded. Thanks for contributing!",
            completed,
        )
    else:
        log.info("Interrupted. No tasks completed.")
    log.info("Claimed tasks will be reassigned automatically.")
    return 0
420
+
421
+
422
if __name__ == "__main__":
    # Script entry point: run the CLI and turn Ctrl+C into a clean summary
    # instead of a traceback.
    try:
        exit_code = main()
    except KeyboardInterrupt:
        exit_code = _print_interrupted(0)
    sys.exit(exit_code)
benchmark/_util.py ADDED
@@ -0,0 +1,188 @@
1
+ """Shared helpers for the benchmark package."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import platform
8
+ import re
9
+ import shutil
10
+ import subprocess
11
+ import time
12
+ from pathlib import Path
13
+
14
+ log = logging.getLogger("benchmark")
15
+
16
+ # Arm name sanitization — used for filenames and Docker container names.
17
+ # Reversible: ":" → "--" (unlike the old ":" → "_" which was lossy).
18
+ _UNSAFE_RE = re.compile(r"[^a-zA-Z0-9_.-]")
19
+
20
+
21
def fmt_duration(seconds: int) -> str:
    """Render a duration in seconds as a compact string.

    Values of an hour or more are shown in hours (whole hours without a
    decimal, otherwise one decimal place), values of a minute or more in whole
    minutes (truncated), and anything smaller in seconds.
    Examples: 7200 -> '2h', 5400 -> '1.5h', 300 -> '5m', 45 -> '45s'.
    """
    if seconds >= 3600:
        hours = seconds / 3600
        if hours == int(hours):
            return f"{hours:.0f}h"
        return f"{hours:.1f}h"
    if seconds >= 60:
        minutes = seconds // 60
        return f"{minutes}m"
    return f"{seconds}s"
29
+
30
+
31
def short_iid(instance_id: str) -> str:
    """Return a compact display form of a SWE-bench instance id.

    'django__django-13195' becomes 'django/django#13195'. Ids without a
    '__' separator are returned unchanged. An issue part longer than 12
    characters (e.g. a commit SHA) is cut to its first 8 characters.
    """
    owner, separator, remainder = instance_id.partition("__")
    if not separator:
        return instance_id

    owner = owner.replace("_", "-")
    cut = remainder.rfind("-")
    if cut <= 0:
        # No usable '-' between repo and issue: show the remainder verbatim.
        return f"{owner}/{remainder}"

    repo = remainder[:cut].replace("_", "-")
    issue = remainder[cut + 1 :]
    # Truncate long hashes (e.g. Go SWE-bench commit SHAs)
    issue = issue[:8] if len(issue) > 12 else issue
    return f"{owner}/{repo}#{issue}"
47
+
48
+
49
def docker_safe(name: str) -> str:
    """Return *name* with every character matched by ``_UNSAFE_RE`` replaced by '_'.

    Used to make a string safe for use as a Docker container name.
    """
    sanitized = _UNSAFE_RE.sub("_", name)
    return sanitized
52
+
53
+
54
def load_json(path: Path) -> dict:
    """Load a JSON file, returning {} on missing/corrupt files.

    Args:
        path: File to read.

    Returns:
        The parsed JSON value, or ``{}`` when the file does not exist, cannot
        be read, cannot be decoded as text, or is not valid JSON. Every
        failure except a missing file is logged as a warning.
    """
    try:
        text = path.read_text()
    except FileNotFoundError:
        # A missing file is the expected "nothing stored yet" case: stay quiet.
        return {}
    except (OSError, ValueError):
        # ValueError covers UnicodeDecodeError raised for undecodable bytes.
        log.warning("Failed to parse %s", path)
        return {}

    try:
        return json.loads(text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        log.warning("Failed to parse %s", path)
        return {}
62
+
63
+
64
def load_jsonl(path: Path) -> list[dict]:
    """Read a JSONL file into a list, one parsed value per non-blank line.

    A missing file yields an empty list. Lines that are not valid JSON are
    skipped with a warning.
    """
    records: list[dict] = []
    if path.exists():
        with open(path) as handle:
            for raw in handle:
                text = raw.strip()
                if text:
                    try:
                        records.append(json.loads(text))
                    except json.JSONDecodeError:
                        log.warning("Skipping bad JSONL line in %s", path)
    return records
79
+
80
+
81
def iter_jsonl(path: Path):
    """Lazily yield one parsed value per non-blank line of a JSONL file.

    Yields nothing when the file does not exist. Lines that are not valid
    JSON are skipped with a warning.
    """
    if path.exists():
        with open(path) as handle:
            for text in map(str.strip, handle):
                if not text:
                    continue
                try:
                    yield json.loads(text)
                except json.JSONDecodeError:
                    log.warning("Skipping bad JSONL line in %s", path)
94
+
95
+
96
def setup_logging(verbose: bool = False) -> None:
    """Configure root logging for benchmark runs.

    Args:
        verbose: Use DEBUG level when true, INFO otherwise.
    """
    chosen_level = logging.INFO
    if verbose:
        chosen_level = logging.DEBUG
    logging.basicConfig(
        level=chosen_level,
        datefmt="%Y-%m-%d %H:%M:%S",
        format="%(asctime)s %(levelname)-5s %(message)s",
    )
104
+
105
+
106
+ # CLI tool name → arm name(s). kodo is always available (it's this project).
107
+ _BACKEND_CLI_MAP: list[tuple[str, list[str]]] = [
108
+ ("claude", ["claude"]),
109
+ ("cursor-agent", ["cursor"]),
110
+ ("codex", ["codex"]),
111
+ ("gemini", ["gemini"]),
112
+ ]
113
+
114
+
115
def ensure_docker_running(timeout: int = 60) -> bool:
    """Make sure the Docker daemon is reachable, starting it on macOS if needed.

    Args:
        timeout: Seconds to wait for the daemon after a start was triggered.

    Returns:
        True when Docker is ready. False when it is not running on a
        non-macOS system, when it could not be started, or when it did not
        become ready within *timeout* seconds.
    """
    if _docker_is_ready():
        return True

    log.info("Docker daemon is not running. Attempting to start...")

    on_macos = platform.system() == "Darwin"
    if not on_macos:
        log.warning(
            "Docker is not running. Start it manually:\n sudo systemctl start docker"
        )
        return False

    started = _start_docker_macos()
    if not started:
        log.error("Could not start Docker. Please start it manually.")
        return False

    # Poll every 2 seconds until the daemon answers or the deadline passes.
    give_up_at = time.monotonic() + timeout
    while time.monotonic() < give_up_at:
        if _docker_is_ready():
            log.info("Docker is now running.")
            return True
        time.sleep(2)

    log.error("Docker did not become ready within %ds.", timeout)
    return False
145
+
146
+
147
+ def _docker_is_ready() -> bool:
148
+ """Return True if ``docker info`` succeeds."""
149
+ try:
150
+ result = subprocess.run(
151
+ ["docker", "info"],
152
+ capture_output=True,
153
+ timeout=10,
154
+ )
155
+ return result.returncode == 0
156
+ except (FileNotFoundError, subprocess.TimeoutExpired):
157
+ return False
158
+
159
+
160
+ def _start_docker_macos() -> bool:
161
+ """Try to start Docker on macOS via OrbStack or Docker Desktop."""
162
+ if shutil.which("orbctl"):
163
+ log.info("Starting Docker via OrbStack...")
164
+ try:
165
+ subprocess.run(["orbctl", "start"], check=True, timeout=30)
166
+ return True
167
+ except (subprocess.CalledProcessError, subprocess.TimeoutExpired):
168
+ log.warning("OrbStack start failed, trying Docker Desktop...")
169
+
170
+ try:
171
+ subprocess.run(["open", "-a", "Docker"], check=True, timeout=10)
172
+ log.info("Starting Docker Desktop (this may take 30-60s)...")
173
+ return True
174
+ except (subprocess.CalledProcessError, FileNotFoundError):
175
+ return False
176
+
177
+
178
def detect_backends() -> list[str]:
    """Return the benchmark arms usable on this machine.

    ``kodo`` is always listed first. Every entry of ``_BACKEND_CLI_MAP`` whose
    CLI executable is found on PATH contributes its arm names, in map order.
    """
    available: list[str] = ["kodo"]
    available += [
        arm
        for executable, arms in _BACKEND_CLI_MAP
        if shutil.which(executable)
        for arm in arms
    ]
    return available