coderace 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coderace-1.2.0 → coderace-1.3.0}/CHANGELOG.md +18 -0
- {coderace-1.2.0 → coderace-1.3.0}/PKG-INFO +44 -1
- {coderace-1.2.0 → coderace-1.3.0}/README.md +43 -0
- coderace-1.3.0/all-day-build-contract-model-selection.md +121 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/__init__.py +1 -1
- coderace-1.3.0/coderace/adapters/__init__.py +77 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/aider.py +11 -4
- {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/base.py +13 -3
- {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/claude.py +12 -6
- {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/codex.py +12 -5
- {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/gemini.py +12 -8
- {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/opencode.py +12 -8
- {coderace-1.2.0 → coderace-1.3.0}/coderace/benchmark.py +17 -11
- {coderace-1.2.0 → coderace-1.3.0}/coderace/cli.py +36 -25
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/race.py +2 -2
- {coderace-1.2.0 → coderace-1.3.0}/coderace/types.py +3 -1
- coderace-1.3.0/examples/model-selection.yaml +30 -0
- coderace-1.3.0/progress-log.md +70 -0
- {coderace-1.2.0 → coderace-1.3.0}/pyproject.toml +1 -1
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_examples.py +3 -2
- coderace-1.3.0/tests/test_model_selection_d1_d2.py +200 -0
- coderace-1.3.0/tests/test_model_selection_d3.py +164 -0
- coderace-1.3.0/tests/test_model_selection_d4.py +186 -0
- coderace-1.2.0/coderace/adapters/__init__.py +0 -26
- coderace-1.2.0/progress-log.md +0 -918
- {coderace-1.2.0 → coderace-1.3.0}/.github/workflows/publish.yml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/.gitignore +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/DONE.txt +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/LICENSE +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/action.yml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-benchmark.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-builtin-tasks.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-ci-integration.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-context-eval.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-cost-tracking.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-dashboard.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-leaderboard.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-race-mode.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-v0.2.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-v090-tasks.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-v1.0-statistical.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-verification-tests.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/benchmark-results/fibonacci-2026-02-27.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/benchmark-results/fibonacci-v2-2026-02-27.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/benchmark-results/hard-tasks-2026-02-27.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/benchmark-results/multi-task-2026-02-27.md +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/benchmark_report.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/benchmark_stats.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/__init__.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/binary-search-tree.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/cli-args-parser.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/csv-analyzer.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/data-pipeline.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/diff-algorithm.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/expression-evaluator.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/fibonacci.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/file-watcher.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/http-server.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/json-parser.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/lru-cache.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/markdown-to-html.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/regex-engine.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/state-machine.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/task-scheduler.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/url-router.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/__init__.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/benchmark.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/context_eval.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/dashboard.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/diff.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/history.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/leaderboard.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/results.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/tasks.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/context_eval.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/context_eval_report.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/cost.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/dashboard.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/elo.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/export.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/git_ops.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/html_report.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/publish.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/reporter.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/scorer.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/statistics.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/stats.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/store.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/coderace/task.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/demo-race.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/examples/add-type-hints.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/examples/ci-race-on-pr.yml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/examples/context-eval-demo.sh +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/examples/example-task.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/examples/fix-edge-case.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/examples/write-tests.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/scripts/ci-run.sh +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/scripts/format-comment.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tasks/markdown-table.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tasks/parse-duration.yaml +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/__init__.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/conftest.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_adapters.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_benchmark.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_benchmark_trials.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_benchmark_v1_integration.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_builtins.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_cli.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_cli_store_integration.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_context_eval.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_context_eval_dashboard.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_cost.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_cost_config.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_cost_integration.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_dashboard.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_dashboard_cli.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_diff.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_elo.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_export.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_format_comment.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_full_workflow.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_git_ops.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_history.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_html_report.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_leaderboard.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_markdown_results.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_publish.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_race.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_reporter.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_scorer.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_statistics.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_stats.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_store.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_task.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_tasks_cli.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/tests/test_verification_integration.py +0 -0
- {coderace-1.2.0 → coderace-1.3.0}/uv.lock +0 -0
|
@@ -1,5 +1,23 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [1.3.0] - 2026-03-05
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- **Model selection**: Per-agent model override via `agent:model` syntax in `--agents` / `--agent` flags
|
|
7
|
+
- Example: `coderace run task.yaml --agent codex:gpt-5.4 --agent codex:gpt-5.3-codex`
|
|
8
|
+
- Example: `coderace benchmark --agents claude:opus-4-6,claude:sonnet-4-6`
|
|
9
|
+
- `BaseAdapter.__init__(model=None)`: all adapters accept optional model at construction
|
|
10
|
+
- `BaseAdapter.build_command(task, model=None)`: model parameter flows to CLI flag
|
|
11
|
+
- `parse_agent_spec()`, `make_display_name()`, `instantiate_adapter()` in `coderace.adapters`
|
|
12
|
+
- All adapters (codex, claude, aider, gemini, opencode) append `--model <name>` when specified
|
|
13
|
+
- Benchmark and race commands handle model-specific agents; display names flow to results, store, ELO, dashboard
|
|
14
|
+
- Task YAML: `agents` list accepts `agent:model` entries (e.g. `- codex:gpt-5.4`)
|
|
15
|
+
|
|
16
|
+
### Changed
|
|
17
|
+
- `AgentResult.agent` is now the display name (`codex (gpt-5.4)`) when a model is specified
|
|
18
|
+
- ELO ratings, leaderboard, and dashboard automatically track model variants as separate entries
|
|
19
|
+
- Branch names sanitized to be git-compatible (colons replaced with dashes)
|
|
20
|
+
|
|
3
21
|
## [1.2.0] - 2026-03-03
|
|
4
22
|
|
|
5
23
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: coderace
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Race coding agents against each other on real tasks
|
|
5
5
|
Project-URL: Homepage, https://github.com/mikiships/coderace
|
|
6
6
|
Project-URL: Repository, https://github.com/mikiships/coderace
|
|
@@ -30,6 +30,11 @@ Description-Content-Type: text/markdown
|
|
|
30
30
|
|
|
31
31
|
# coderace
|
|
32
32
|
|
|
33
|
+
[](https://pypi.org/project/coderace/)
|
|
34
|
+
[](#install)
|
|
35
|
+
[](#)
|
|
36
|
+
[](#license)
|
|
37
|
+
|
|
33
38
|
Stop reading blog comparisons. Race coding agents against each other on real tasks in *your* repo with *your* code.
|
|
34
39
|
|
|
35
40
|
Every week there's a new "Claude Code vs Codex vs Cursor" post. They test on toy problems with cherry-picked examples. coderace gives you automated, reproducible, scored comparisons on the tasks you actually care about.
|
|
@@ -340,6 +345,41 @@ Keys can be agent names (`claude`, `codex`, `aider`, `gemini`, `opencode`) or mo
|
|
|
340
345
|
|
|
341
346
|
Pricing is easy to update: the table lives in `coderace/cost.py` as a plain dict.
|
|
342
347
|
|
|
348
|
+
## Model Selection
|
|
349
|
+
|
|
350
|
+
Compare different models of the same agent head-to-head using the `agent:model` syntax:
|
|
351
|
+
|
|
352
|
+
```bash
|
|
353
|
+
# Compare two Codex models on the same task
|
|
354
|
+
coderace run task.yaml --agent codex:gpt-5.4 --agent codex:gpt-5.3-codex
|
|
355
|
+
|
|
356
|
+
# Mix agents and models
|
|
357
|
+
coderace run task.yaml --agent codex:gpt-5.4 --agent claude:opus-4-6 --agent claude:sonnet-4-6
|
|
358
|
+
|
|
359
|
+
# Benchmark multiple model variants across built-in tasks
|
|
360
|
+
coderace benchmark --agents codex:gpt-5.4,codex:gpt-5.3-codex,claude:opus-4-6
|
|
361
|
+
|
|
362
|
+
# Race with model variants (parallel)
|
|
363
|
+
coderace race task.yaml
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
In task YAML files:
|
|
367
|
+
|
|
368
|
+
```yaml
|
|
369
|
+
agents:
|
|
370
|
+
- codex:gpt-5.4
|
|
371
|
+
- codex:gpt-5.3-codex
|
|
372
|
+
- claude:opus-4-6
|
|
373
|
+
- claude:sonnet-4-6
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
**How it works:**
|
|
377
|
+
- `agent:model` splits on the first colon: `codex:gpt-5.4` → agent `codex`, model `gpt-5.4`
|
|
378
|
+
- The model is passed via `--model <name>` to the underlying CLI
|
|
379
|
+
- Results display as `codex (gpt-5.4)` vs `codex (gpt-5.3-codex)` for easy comparison
|
|
380
|
+
- ELO ratings, leaderboard, and dashboard track each model variant separately
|
|
381
|
+
- The same agent can appear multiple times with different models in one run
|
|
382
|
+
|
|
343
383
|
## Leaderboard & History
|
|
344
384
|
|
|
345
385
|
Every `coderace run` automatically saves results to a local SQLite database (`~/.coderace/results.db`). Two new commands aggregate this data.
|
|
@@ -854,3 +894,6 @@ coderace context-eval --context-file v2-claude.md --task task.yaml --agents clau
|
|
|
854
894
|
## See Also
|
|
855
895
|
|
|
856
896
|
- **[agentmd](https://github.com/mikiships/agentmd)** — Generate and score context files (CLAUDE.md, AGENTS.md, .cursorrules) for AI coding agents. Pair with coderace: generate context with agentmd, measure agent performance with coderace, iterate with data instead of vibes.
|
|
897
|
+
- **[agentlint](https://github.com/mikiships/agentlint)** — Lint AI agent git diffs for risky patterns (scope drift, secret leaks, test regression). Static analysis, no LLM required.
|
|
898
|
+
|
|
899
|
+
Measure (coderace) → Optimize (agentmd) → Guard (agentlint).
|
|
@@ -1,5 +1,10 @@
|
|
|
1
1
|
# coderace
|
|
2
2
|
|
|
3
|
+
[](https://pypi.org/project/coderace/)
|
|
4
|
+
[](#install)
|
|
5
|
+
[](#)
|
|
6
|
+
[](#license)
|
|
7
|
+
|
|
3
8
|
Stop reading blog comparisons. Race coding agents against each other on real tasks in *your* repo with *your* code.
|
|
4
9
|
|
|
5
10
|
Every week there's a new "Claude Code vs Codex vs Cursor" post. They test on toy problems with cherry-picked examples. coderace gives you automated, reproducible, scored comparisons on the tasks you actually care about.
|
|
@@ -310,6 +315,41 @@ Keys can be agent names (`claude`, `codex`, `aider`, `gemini`, `opencode`) or mo
|
|
|
310
315
|
|
|
311
316
|
Pricing is easy to update: the table lives in `coderace/cost.py` as a plain dict.
|
|
312
317
|
|
|
318
|
+
## Model Selection
|
|
319
|
+
|
|
320
|
+
Compare different models of the same agent head-to-head using the `agent:model` syntax:
|
|
321
|
+
|
|
322
|
+
```bash
|
|
323
|
+
# Compare two Codex models on the same task
|
|
324
|
+
coderace run task.yaml --agent codex:gpt-5.4 --agent codex:gpt-5.3-codex
|
|
325
|
+
|
|
326
|
+
# Mix agents and models
|
|
327
|
+
coderace run task.yaml --agent codex:gpt-5.4 --agent claude:opus-4-6 --agent claude:sonnet-4-6
|
|
328
|
+
|
|
329
|
+
# Benchmark multiple model variants across built-in tasks
|
|
330
|
+
coderace benchmark --agents codex:gpt-5.4,codex:gpt-5.3-codex,claude:opus-4-6
|
|
331
|
+
|
|
332
|
+
# Race with model variants (parallel)
|
|
333
|
+
coderace race task.yaml
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
In task YAML files:
|
|
337
|
+
|
|
338
|
+
```yaml
|
|
339
|
+
agents:
|
|
340
|
+
- codex:gpt-5.4
|
|
341
|
+
- codex:gpt-5.3-codex
|
|
342
|
+
- claude:opus-4-6
|
|
343
|
+
- claude:sonnet-4-6
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
**How it works:**
|
|
347
|
+
- `agent:model` splits on the first colon: `codex:gpt-5.4` → agent `codex`, model `gpt-5.4`
|
|
348
|
+
- The model is passed via `--model <name>` to the underlying CLI
|
|
349
|
+
- Results display as `codex (gpt-5.4)` vs `codex (gpt-5.3-codex)` for easy comparison
|
|
350
|
+
- ELO ratings, leaderboard, and dashboard track each model variant separately
|
|
351
|
+
- The same agent can appear multiple times with different models in one run
|
|
352
|
+
|
|
313
353
|
## Leaderboard & History
|
|
314
354
|
|
|
315
355
|
Every `coderace run` automatically saves results to a local SQLite database (`~/.coderace/results.db`). Two new commands aggregate this data.
|
|
@@ -824,3 +864,6 @@ coderace context-eval --context-file v2-claude.md --task task.yaml --agents clau
|
|
|
824
864
|
## See Also
|
|
825
865
|
|
|
826
866
|
- **[agentmd](https://github.com/mikiships/agentmd)** — Generate and score context files (CLAUDE.md, AGENTS.md, .cursorrules) for AI coding agents. Pair with coderace: generate context with agentmd, measure agent performance with coderace, iterate with data instead of vibes.
|
|
867
|
+
- **[agentlint](https://github.com/mikiships/agentlint)** — Lint AI agent git diffs for risky patterns (scope drift, secret leaks, test regression). Static analysis, no LLM required.
|
|
868
|
+
|
|
869
|
+
Measure (coderace) → Optimize (agentmd) → Guard (agentlint).
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# All-Day Build Contract: Model Selection for Adapters
|
|
2
|
+
|
|
3
|
+
Status: In Progress
|
|
4
|
+
Date: 2026-03-05
|
|
5
|
+
Owner: Codex execution pass
|
|
6
|
+
Scope type: Deliverable-gated (no hour promises)
|
|
7
|
+
|
|
8
|
+
## 1. Objective
|
|
9
|
+
|
|
10
|
+
Add per-agent model selection to coderace so users can benchmark different models within the same agent CLI. For example: `coderace run task.yaml --agents codex:gpt-5.4,codex:gpt-5.3-codex,claude:opus-4-6,claude:sonnet-4-6` to compare models head-to-head on the same tasks.
|
|
11
|
+
|
|
12
|
+
This enables the "which model is actually best for coding" benchmark content that vibes-based blog posts can't provide.
|
|
13
|
+
|
|
14
|
+
This contract is considered complete only when every deliverable and validation gate below is satisfied.
|
|
15
|
+
|
|
16
|
+
## 2. Non-Negotiable Build Rules
|
|
17
|
+
|
|
18
|
+
1. No time-based completion claims.
|
|
19
|
+
2. Completion is allowed only when all checklist items are checked.
|
|
20
|
+
3. Full test suite must pass at the end.
|
|
21
|
+
4. New features must ship with docs and report addendum updates in the same pass.
|
|
22
|
+
5. CLI outputs must be deterministic and schema-backed where specified.
|
|
23
|
+
6. Never modify files outside the project directory.
|
|
24
|
+
7. Commit after each completed deliverable (not at the end).
|
|
25
|
+
8. If stuck on same issue for 3 attempts, stop and write a blocker report.
|
|
26
|
+
9. Do NOT refactor, restyle, or "improve" code outside the deliverables.
|
|
27
|
+
10. Read existing tests and docs before writing new code.
|
|
28
|
+
|
|
29
|
+
## 3. Feature Deliverables
|
|
30
|
+
|
|
31
|
+
### D1. Base Adapter Model Support (core)
|
|
32
|
+
|
|
33
|
+
Add optional `model` parameter to BaseAdapter so subclasses can receive a model override.
|
|
34
|
+
|
|
35
|
+
Required files:
|
|
36
|
+
- `coderace/adapters/base.py`
|
|
37
|
+
|
|
38
|
+
- [ ] Add `model: Optional[str] = None` to `__init__` (or as class attribute)
|
|
39
|
+
- [ ] Pass `model` through to `build_command` signature: `build_command(self, task_description: str, model: Optional[str] = None) -> list[str]`
|
|
40
|
+
- [ ] Update `run()` to pass model to `build_command`
|
|
41
|
+
- [ ] Update `parse_cost` calls to use the model override when provided
|
|
42
|
+
- [ ] Tests for D1
|
|
43
|
+
|
|
44
|
+
### D2. Codex and Claude Adapter Model Flags
|
|
45
|
+
|
|
46
|
+
Update the two main adapters to pass `--model` when a model is specified.
|
|
47
|
+
|
|
48
|
+
Required files:
|
|
49
|
+
- `coderace/adapters/codex.py`
|
|
50
|
+
- `coderace/adapters/claude.py`
|
|
51
|
+
|
|
52
|
+
- [ ] CodexAdapter.build_command: append `--model`, model_name when model is not None
|
|
53
|
+
- [ ] ClaudeAdapter.build_command: append `--model`, model_name when model is not None
|
|
54
|
+
- [ ] Update parse_cost to use the provided model name for accurate pricing
|
|
55
|
+
- [ ] Also update aider.py, gemini.py, opencode.py adapters if they support model flags (check their --help)
|
|
56
|
+
- [ ] Tests for D2
|
|
57
|
+
|
|
58
|
+
### D3. Agent:Model CLI Syntax
|
|
59
|
+
|
|
60
|
+
Parse `agent:model` syntax in the CLI so users can specify models per agent.
|
|
61
|
+
|
|
62
|
+
Required files:
|
|
63
|
+
- `coderace/cli.py` (or wherever `--agents` is parsed)
|
|
64
|
+
- `coderace/adapters/__init__.py` (adapter registry/factory)
|
|
65
|
+
|
|
66
|
+
The syntax: `--agents codex:gpt-5.4,claude:opus-4-6`
|
|
67
|
+
- If no `:model` suffix, use the adapter's default (current behavior)
|
|
68
|
+
- If `:model` suffix, pass it through to the adapter
|
|
69
|
+
- The same agent can appear multiple times with different models
|
|
70
|
+
- Agent display name in results should include the model: `codex (gpt-5.4)` vs `codex (gpt-5.3-codex)`
|
|
71
|
+
|
|
72
|
+
- [ ] Parse `agent:model` in CLI --agents flag
|
|
73
|
+
- [ ] Support duplicate agents with different models in the same run
|
|
74
|
+
- [ ] Display agent+model in result tables and reports
|
|
75
|
+
- [ ] Works with `run`, `benchmark`, and `race` commands
|
|
76
|
+
- [ ] Tests for D3
|
|
77
|
+
|
|
78
|
+
### D4. Benchmark and Race Command Integration
|
|
79
|
+
|
|
80
|
+
Ensure `benchmark` and `race` commands correctly handle model-specific agents.
|
|
81
|
+
|
|
82
|
+
Required files:
|
|
83
|
+
- `coderace/benchmark.py`
|
|
84
|
+
- `coderace/commands/` (race command if separate)
|
|
85
|
+
- `coderace/store.py` (results storage)
|
|
86
|
+
|
|
87
|
+
- [ ] Benchmark results store agent+model as the identifier (not just agent name)
|
|
88
|
+
- [ ] ELO ratings track agent+model combinations separately
|
|
89
|
+
- [ ] Leaderboard shows model variants as separate entries
|
|
90
|
+
- [ ] Dashboard HTML includes model information
|
|
91
|
+
- [ ] Tests for D4
|
|
92
|
+
|
|
93
|
+
### D5. Documentation and Version Bump
|
|
94
|
+
|
|
95
|
+
- [ ] Update README.md with model selection examples
|
|
96
|
+
- [ ] Add model selection section to examples/
|
|
97
|
+
- [ ] Update CHANGELOG.md
|
|
98
|
+
- [ ] Bump version to 1.3.0 in pyproject.toml
|
|
99
|
+
- [ ] All existing 526 tests still pass
|
|
100
|
+
- [ ] New tests bring total to 550+
|
|
101
|
+
|
|
102
|
+
## 4. Test Requirements
|
|
103
|
+
|
|
104
|
+
- [ ] Unit tests for each adapter with model override
|
|
105
|
+
- [ ] Unit tests for agent:model parsing
|
|
106
|
+
- [ ] Integration test: dry-run benchmark with model variants
|
|
107
|
+
- [ ] Edge cases: invalid model name, empty model, agent without model support
|
|
108
|
+
- [ ] All existing 526 tests must still pass
|
|
109
|
+
|
|
110
|
+
## 5. Reports
|
|
111
|
+
|
|
112
|
+
- Write progress to `progress-log.md` after each deliverable
|
|
113
|
+
- Include: what was built, what tests pass, what's next, any blockers
|
|
114
|
+
- Final summary when all deliverables done or stopped
|
|
115
|
+
|
|
116
|
+
## 6. Stop Conditions
|
|
117
|
+
|
|
118
|
+
- All deliverables checked and all tests passing -> DONE
|
|
119
|
+
- 3 consecutive failed attempts on same issue -> STOP, write blocker report
|
|
120
|
+
- Scope creep detected (new requirements discovered) -> STOP, report what's new
|
|
121
|
+
- All tests passing but deliverables remain -> continue to next deliverable
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Agent adapters for coderace."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from coderace.adapters.aider import AiderAdapter
|
|
8
|
+
from coderace.adapters.base import BaseAdapter
|
|
9
|
+
from coderace.adapters.claude import ClaudeAdapter
|
|
10
|
+
from coderace.adapters.codex import CodexAdapter
|
|
11
|
+
from coderace.adapters.gemini import GeminiAdapter
|
|
12
|
+
from coderace.adapters.opencode import OpenCodeAdapter
|
|
13
|
+
|
|
14
|
+
ADAPTERS: dict[str, type[BaseAdapter]] = {
|
|
15
|
+
"claude": ClaudeAdapter,
|
|
16
|
+
"codex": CodexAdapter,
|
|
17
|
+
"aider": AiderAdapter,
|
|
18
|
+
"gemini": GeminiAdapter,
|
|
19
|
+
"opencode": OpenCodeAdapter,
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def parse_agent_spec(spec: str) -> tuple[str, Optional[str]]:
|
|
24
|
+
"""Parse an agent spec string into (agent_name, model_or_None).
|
|
25
|
+
|
|
26
|
+
Examples:
|
|
27
|
+
"codex" -> ("codex", None)
|
|
28
|
+
"codex:gpt-5.4" -> ("codex", "gpt-5.4")
|
|
29
|
+
"claude:opus-4-6" -> ("claude", "opus-4-6")
|
|
30
|
+
"""
|
|
31
|
+
if ":" in spec:
|
|
32
|
+
agent_name, model = spec.split(":", 1)
|
|
33
|
+
return agent_name.strip(), model.strip() or None
|
|
34
|
+
return spec.strip(), None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def make_display_name(agent_name: str, model: Optional[str]) -> str:
|
|
38
|
+
"""Return display name for agent+model combo.
|
|
39
|
+
|
|
40
|
+
Examples:
|
|
41
|
+
("codex", None) -> "codex"
|
|
42
|
+
("codex", "gpt-5.4") -> "codex (gpt-5.4)"
|
|
43
|
+
"""
|
|
44
|
+
if model:
|
|
45
|
+
return f"{agent_name} ({model})"
|
|
46
|
+
return agent_name
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def instantiate_adapter(spec: str) -> BaseAdapter:
|
|
50
|
+
"""Instantiate an adapter from an agent spec string (e.g. 'codex:gpt-5.4').
|
|
51
|
+
|
|
52
|
+
The returned adapter has:
|
|
53
|
+
- adapter.model set to the parsed model (or None)
|
|
54
|
+
- adapter.name set to the display name (e.g. 'codex (gpt-5.4)')
|
|
55
|
+
|
|
56
|
+
Raises KeyError if the agent name is not in ADAPTERS.
|
|
57
|
+
"""
|
|
58
|
+
agent_name, model = parse_agent_spec(spec)
|
|
59
|
+
adapter_cls = ADAPTERS[agent_name]
|
|
60
|
+
adapter = adapter_cls(model=model)
|
|
61
|
+
# Override the instance name to be the display name
|
|
62
|
+
adapter.name = make_display_name(agent_name, model)
|
|
63
|
+
return adapter
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
__all__ = [
|
|
67
|
+
"ADAPTERS",
|
|
68
|
+
"BaseAdapter",
|
|
69
|
+
"ClaudeAdapter",
|
|
70
|
+
"CodexAdapter",
|
|
71
|
+
"AiderAdapter",
|
|
72
|
+
"GeminiAdapter",
|
|
73
|
+
"OpenCodeAdapter",
|
|
74
|
+
"parse_agent_spec",
|
|
75
|
+
"make_display_name",
|
|
76
|
+
"instantiate_adapter",
|
|
77
|
+
]
|
|
@@ -7,27 +7,34 @@ from typing import Optional
|
|
|
7
7
|
from coderace.adapters.base import BaseAdapter
|
|
8
8
|
from coderace.cost import CostResult, parse_aider_cost
|
|
9
9
|
|
|
10
|
+
DEFAULT_AIDER_MODEL = "aider-default"
|
|
11
|
+
|
|
10
12
|
|
|
11
13
|
class AiderAdapter(BaseAdapter):
|
|
12
14
|
"""Adapter for Aider coding assistant."""
|
|
13
15
|
|
|
14
16
|
name = "aider"
|
|
15
17
|
|
|
16
|
-
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
-
|
|
18
|
+
def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
|
|
19
|
+
cmd = [
|
|
18
20
|
"aider",
|
|
19
21
|
"--message",
|
|
20
22
|
task_description,
|
|
21
23
|
"--yes",
|
|
22
24
|
"--no-auto-commits",
|
|
23
25
|
]
|
|
26
|
+
effective_model = model or self.model
|
|
27
|
+
if effective_model:
|
|
28
|
+
cmd += ["--model", effective_model]
|
|
29
|
+
return cmd
|
|
24
30
|
|
|
25
31
|
def parse_cost(
|
|
26
32
|
self,
|
|
27
33
|
stdout: str,
|
|
28
34
|
stderr: str,
|
|
29
|
-
model_name: str = "
|
|
35
|
+
model_name: str = "",
|
|
30
36
|
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
31
37
|
) -> Optional[CostResult]:
|
|
32
38
|
"""Parse cost data from Aider output."""
|
|
33
|
-
|
|
39
|
+
effective_model = model_name or self.model or DEFAULT_AIDER_MODEL
|
|
40
|
+
return parse_aider_cost(stdout, stderr, effective_model, custom_pricing)
|
|
@@ -17,8 +17,12 @@ class BaseAdapter(ABC):
|
|
|
17
17
|
|
|
18
18
|
name: str = "base"
|
|
19
19
|
|
|
20
|
+
def __init__(self, model: Optional[str] = None) -> None:
|
|
21
|
+
"""Initialize adapter with optional model override."""
|
|
22
|
+
self.model = model
|
|
23
|
+
|
|
20
24
|
@abstractmethod
|
|
21
|
-
def build_command(self, task_description: str) -> list[str]:
|
|
25
|
+
def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
|
|
22
26
|
"""Build the CLI command to invoke this agent."""
|
|
23
27
|
...
|
|
24
28
|
|
|
@@ -44,7 +48,8 @@ class BaseAdapter(ABC):
|
|
|
44
48
|
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
45
49
|
) -> AgentResult:
|
|
46
50
|
"""Run the agent on a task and capture results."""
|
|
47
|
-
|
|
51
|
+
model = self.model
|
|
52
|
+
cmd = self.build_command(task_description, model=model)
|
|
48
53
|
start = time.monotonic()
|
|
49
54
|
timed_out = False
|
|
50
55
|
|
|
@@ -76,7 +81,12 @@ class BaseAdapter(ABC):
|
|
|
76
81
|
cost_result: Optional[CostResult] = None
|
|
77
82
|
if not no_cost:
|
|
78
83
|
try:
|
|
79
|
-
cost_result = self.parse_cost(
|
|
84
|
+
cost_result = self.parse_cost(
|
|
85
|
+
stdout,
|
|
86
|
+
stderr,
|
|
87
|
+
model_name=model or "",
|
|
88
|
+
custom_pricing=custom_pricing,
|
|
89
|
+
)
|
|
80
90
|
except Exception:
|
|
81
91
|
pass
|
|
82
92
|
|
|
@@ -7,29 +7,35 @@ from typing import Optional
|
|
|
7
7
|
from coderace.adapters.base import BaseAdapter
|
|
8
8
|
from coderace.cost import CostResult, parse_claude_cost
|
|
9
9
|
|
|
10
|
+
DEFAULT_CLAUDE_MODEL = "claude-sonnet-4-6"
|
|
11
|
+
|
|
10
12
|
|
|
11
13
|
class ClaudeAdapter(BaseAdapter):
|
|
12
14
|
"""Adapter for Claude Code CLI."""
|
|
13
15
|
|
|
14
16
|
name = "claude"
|
|
15
17
|
|
|
16
|
-
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
-
|
|
18
|
+
def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
|
|
19
|
+
cmd = [
|
|
18
20
|
"claude",
|
|
19
21
|
"--print",
|
|
20
22
|
"--output-format",
|
|
21
23
|
"json",
|
|
22
24
|
"--dangerously-skip-permissions",
|
|
23
|
-
"-p",
|
|
24
|
-
task_description,
|
|
25
25
|
]
|
|
26
|
+
effective_model = model or self.model
|
|
27
|
+
if effective_model:
|
|
28
|
+
cmd += ["--model", effective_model]
|
|
29
|
+
cmd += ["-p", task_description]
|
|
30
|
+
return cmd
|
|
26
31
|
|
|
27
32
|
def parse_cost(
|
|
28
33
|
self,
|
|
29
34
|
stdout: str,
|
|
30
35
|
stderr: str,
|
|
31
|
-
model_name: str = "
|
|
36
|
+
model_name: str = "",
|
|
32
37
|
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
33
38
|
) -> Optional[CostResult]:
|
|
34
39
|
"""Parse cost data from Claude Code output."""
|
|
35
|
-
|
|
40
|
+
effective_model = model_name or self.model or DEFAULT_CLAUDE_MODEL
|
|
41
|
+
return parse_claude_cost(stdout, stderr, effective_model, custom_pricing)
|
|
@@ -7,26 +7,33 @@ from typing import Optional
|
|
|
7
7
|
from coderace.adapters.base import BaseAdapter
|
|
8
8
|
from coderace.cost import CostResult, parse_codex_cost
|
|
9
9
|
|
|
10
|
+
DEFAULT_CODEX_MODEL = "gpt-5.3-codex"
|
|
11
|
+
|
|
10
12
|
|
|
11
13
|
class CodexAdapter(BaseAdapter):
|
|
12
14
|
"""Adapter for OpenAI Codex CLI."""
|
|
13
15
|
|
|
14
16
|
name = "codex"
|
|
15
17
|
|
|
16
|
-
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
-
|
|
18
|
+
def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
|
|
19
|
+
cmd = [
|
|
18
20
|
"codex",
|
|
19
21
|
"exec",
|
|
20
22
|
"--full-auto",
|
|
21
|
-
task_description,
|
|
22
23
|
]
|
|
24
|
+
effective_model = model or self.model
|
|
25
|
+
if effective_model:
|
|
26
|
+
cmd += ["--model", effective_model]
|
|
27
|
+
cmd.append(task_description)
|
|
28
|
+
return cmd
|
|
23
29
|
|
|
24
30
|
def parse_cost(
|
|
25
31
|
self,
|
|
26
32
|
stdout: str,
|
|
27
33
|
stderr: str,
|
|
28
|
-
model_name: str = "
|
|
34
|
+
model_name: str = "",
|
|
29
35
|
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
30
36
|
) -> Optional[CostResult]:
|
|
31
37
|
"""Parse cost data from Codex CLI output."""
|
|
32
|
-
|
|
38
|
+
effective_model = model_name or self.model or DEFAULT_CODEX_MODEL
|
|
39
|
+
return parse_codex_cost(stdout, stderr, effective_model, custom_pricing)
|
|
@@ -7,25 +7,29 @@ from typing import Optional
|
|
|
7
7
|
from coderace.adapters.base import BaseAdapter
|
|
8
8
|
from coderace.cost import CostResult, parse_gemini_cost
|
|
9
9
|
|
|
10
|
+
DEFAULT_GEMINI_MODEL = "gemini-2.5-pro"
|
|
11
|
+
|
|
10
12
|
|
|
11
13
|
class GeminiAdapter(BaseAdapter):
|
|
12
14
|
"""Adapter for Google Gemini CLI."""
|
|
13
15
|
|
|
14
16
|
name = "gemini"
|
|
15
17
|
|
|
16
|
-
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
]
|
|
18
|
+
def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
|
|
19
|
+
cmd = ["gemini"]
|
|
20
|
+
effective_model = model or self.model
|
|
21
|
+
if effective_model:
|
|
22
|
+
cmd += ["--model", effective_model]
|
|
23
|
+
cmd += ["-p", task_description]
|
|
24
|
+
return cmd
|
|
22
25
|
|
|
23
26
|
def parse_cost(
|
|
24
27
|
self,
|
|
25
28
|
stdout: str,
|
|
26
29
|
stderr: str,
|
|
27
|
-
model_name: str = "
|
|
30
|
+
model_name: str = "",
|
|
28
31
|
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
29
32
|
) -> Optional[CostResult]:
|
|
30
33
|
"""Parse cost data from Gemini CLI output."""
|
|
31
|
-
|
|
34
|
+
effective_model = model_name or self.model or DEFAULT_GEMINI_MODEL
|
|
35
|
+
return parse_gemini_cost(stdout, stderr, effective_model, custom_pricing)
|
|
@@ -7,25 +7,29 @@ from typing import Optional
|
|
|
7
7
|
from coderace.adapters.base import BaseAdapter
|
|
8
8
|
from coderace.cost import CostResult, parse_opencode_cost
|
|
9
9
|
|
|
10
|
+
DEFAULT_OPENCODE_MODEL = "opencode-default"
|
|
11
|
+
|
|
10
12
|
|
|
11
13
|
class OpenCodeAdapter(BaseAdapter):
|
|
12
14
|
"""Adapter for OpenCode CLI (terminal-first AI coding agent)."""
|
|
13
15
|
|
|
14
16
|
name = "opencode"
|
|
15
17
|
|
|
16
|
-
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
18
|
+
def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
|
|
19
|
+
cmd = ["opencode", "run"]
|
|
20
|
+
effective_model = model or self.model
|
|
21
|
+
if effective_model:
|
|
22
|
+
cmd += ["--model", effective_model]
|
|
23
|
+
cmd.append(task_description)
|
|
24
|
+
return cmd
|
|
22
25
|
|
|
23
26
|
def parse_cost(
|
|
24
27
|
self,
|
|
25
28
|
stdout: str,
|
|
26
29
|
stderr: str,
|
|
27
|
-
model_name: str = "
|
|
30
|
+
model_name: str = "",
|
|
28
31
|
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
29
32
|
) -> Optional[CostResult]:
|
|
30
33
|
"""Parse cost data from OpenCode output."""
|
|
31
|
-
|
|
34
|
+
effective_model = model_name or self.model or DEFAULT_OPENCODE_MODEL
|
|
35
|
+
return parse_opencode_cost(stdout, stderr, effective_model, custom_pricing)
|