coderace 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. {coderace-1.2.0 → coderace-1.3.0}/CHANGELOG.md +18 -0
  2. {coderace-1.2.0 → coderace-1.3.0}/PKG-INFO +44 -1
  3. {coderace-1.2.0 → coderace-1.3.0}/README.md +43 -0
  4. coderace-1.3.0/all-day-build-contract-model-selection.md +121 -0
  5. {coderace-1.2.0 → coderace-1.3.0}/coderace/__init__.py +1 -1
  6. coderace-1.3.0/coderace/adapters/__init__.py +77 -0
  7. {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/aider.py +11 -4
  8. {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/base.py +13 -3
  9. {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/claude.py +12 -6
  10. {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/codex.py +12 -5
  11. {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/gemini.py +12 -8
  12. {coderace-1.2.0 → coderace-1.3.0}/coderace/adapters/opencode.py +12 -8
  13. {coderace-1.2.0 → coderace-1.3.0}/coderace/benchmark.py +17 -11
  14. {coderace-1.2.0 → coderace-1.3.0}/coderace/cli.py +36 -25
  15. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/race.py +2 -2
  16. {coderace-1.2.0 → coderace-1.3.0}/coderace/types.py +3 -1
  17. coderace-1.3.0/examples/model-selection.yaml +30 -0
  18. coderace-1.3.0/progress-log.md +70 -0
  19. {coderace-1.2.0 → coderace-1.3.0}/pyproject.toml +1 -1
  20. {coderace-1.2.0 → coderace-1.3.0}/tests/test_examples.py +3 -2
  21. coderace-1.3.0/tests/test_model_selection_d1_d2.py +200 -0
  22. coderace-1.3.0/tests/test_model_selection_d3.py +164 -0
  23. coderace-1.3.0/tests/test_model_selection_d4.py +186 -0
  24. coderace-1.2.0/coderace/adapters/__init__.py +0 -26
  25. coderace-1.2.0/progress-log.md +0 -918
  26. {coderace-1.2.0 → coderace-1.3.0}/.github/workflows/publish.yml +0 -0
  27. {coderace-1.2.0 → coderace-1.3.0}/.gitignore +0 -0
  28. {coderace-1.2.0 → coderace-1.3.0}/DONE.txt +0 -0
  29. {coderace-1.2.0 → coderace-1.3.0}/LICENSE +0 -0
  30. {coderace-1.2.0 → coderace-1.3.0}/action.yml +0 -0
  31. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-benchmark.md +0 -0
  32. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-builtin-tasks.md +0 -0
  33. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-ci-integration.md +0 -0
  34. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-context-eval.md +0 -0
  35. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-cost-tracking.md +0 -0
  36. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-dashboard.md +0 -0
  37. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-leaderboard.md +0 -0
  38. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-race-mode.md +0 -0
  39. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-v0.2.md +0 -0
  40. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-v090-tasks.md +0 -0
  41. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-v1.0-statistical.md +0 -0
  42. {coderace-1.2.0 → coderace-1.3.0}/all-day-build-contract-verification-tests.md +0 -0
  43. {coderace-1.2.0 → coderace-1.3.0}/benchmark-results/fibonacci-2026-02-27.md +0 -0
  44. {coderace-1.2.0 → coderace-1.3.0}/benchmark-results/fibonacci-v2-2026-02-27.md +0 -0
  45. {coderace-1.2.0 → coderace-1.3.0}/benchmark-results/hard-tasks-2026-02-27.md +0 -0
  46. {coderace-1.2.0 → coderace-1.3.0}/benchmark-results/multi-task-2026-02-27.md +0 -0
  47. {coderace-1.2.0 → coderace-1.3.0}/coderace/benchmark_report.py +0 -0
  48. {coderace-1.2.0 → coderace-1.3.0}/coderace/benchmark_stats.py +0 -0
  49. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/__init__.py +0 -0
  50. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/binary-search-tree.yaml +0 -0
  51. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/cli-args-parser.yaml +0 -0
  52. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/csv-analyzer.yaml +0 -0
  53. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/data-pipeline.yaml +0 -0
  54. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/diff-algorithm.yaml +0 -0
  55. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/expression-evaluator.yaml +0 -0
  56. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/fibonacci.yaml +0 -0
  57. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/file-watcher.yaml +0 -0
  58. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/http-server.yaml +0 -0
  59. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/json-parser.yaml +0 -0
  60. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/lru-cache.yaml +0 -0
  61. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/markdown-to-html.yaml +0 -0
  62. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/regex-engine.yaml +0 -0
  63. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/state-machine.yaml +0 -0
  64. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/task-scheduler.yaml +0 -0
  65. {coderace-1.2.0 → coderace-1.3.0}/coderace/builtins/tasks/url-router.yaml +0 -0
  66. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/__init__.py +0 -0
  67. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/benchmark.py +0 -0
  68. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/context_eval.py +0 -0
  69. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/dashboard.py +0 -0
  70. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/diff.py +0 -0
  71. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/history.py +0 -0
  72. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/leaderboard.py +0 -0
  73. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/results.py +0 -0
  74. {coderace-1.2.0 → coderace-1.3.0}/coderace/commands/tasks.py +0 -0
  75. {coderace-1.2.0 → coderace-1.3.0}/coderace/context_eval.py +0 -0
  76. {coderace-1.2.0 → coderace-1.3.0}/coderace/context_eval_report.py +0 -0
  77. {coderace-1.2.0 → coderace-1.3.0}/coderace/cost.py +0 -0
  78. {coderace-1.2.0 → coderace-1.3.0}/coderace/dashboard.py +0 -0
  79. {coderace-1.2.0 → coderace-1.3.0}/coderace/elo.py +0 -0
  80. {coderace-1.2.0 → coderace-1.3.0}/coderace/export.py +0 -0
  81. {coderace-1.2.0 → coderace-1.3.0}/coderace/git_ops.py +0 -0
  82. {coderace-1.2.0 → coderace-1.3.0}/coderace/html_report.py +0 -0
  83. {coderace-1.2.0 → coderace-1.3.0}/coderace/publish.py +0 -0
  84. {coderace-1.2.0 → coderace-1.3.0}/coderace/reporter.py +0 -0
  85. {coderace-1.2.0 → coderace-1.3.0}/coderace/scorer.py +0 -0
  86. {coderace-1.2.0 → coderace-1.3.0}/coderace/statistics.py +0 -0
  87. {coderace-1.2.0 → coderace-1.3.0}/coderace/stats.py +0 -0
  88. {coderace-1.2.0 → coderace-1.3.0}/coderace/store.py +0 -0
  89. {coderace-1.2.0 → coderace-1.3.0}/coderace/task.py +0 -0
  90. {coderace-1.2.0 → coderace-1.3.0}/demo-race.yaml +0 -0
  91. {coderace-1.2.0 → coderace-1.3.0}/examples/add-type-hints.yaml +0 -0
  92. {coderace-1.2.0 → coderace-1.3.0}/examples/ci-race-on-pr.yml +0 -0
  93. {coderace-1.2.0 → coderace-1.3.0}/examples/context-eval-demo.sh +0 -0
  94. {coderace-1.2.0 → coderace-1.3.0}/examples/example-task.yaml +0 -0
  95. {coderace-1.2.0 → coderace-1.3.0}/examples/fix-edge-case.yaml +0 -0
  96. {coderace-1.2.0 → coderace-1.3.0}/examples/write-tests.yaml +0 -0
  97. {coderace-1.2.0 → coderace-1.3.0}/scripts/ci-run.sh +0 -0
  98. {coderace-1.2.0 → coderace-1.3.0}/scripts/format-comment.py +0 -0
  99. {coderace-1.2.0 → coderace-1.3.0}/tasks/markdown-table.yaml +0 -0
  100. {coderace-1.2.0 → coderace-1.3.0}/tasks/parse-duration.yaml +0 -0
  101. {coderace-1.2.0 → coderace-1.3.0}/tests/__init__.py +0 -0
  102. {coderace-1.2.0 → coderace-1.3.0}/tests/conftest.py +0 -0
  103. {coderace-1.2.0 → coderace-1.3.0}/tests/test_adapters.py +0 -0
  104. {coderace-1.2.0 → coderace-1.3.0}/tests/test_benchmark.py +0 -0
  105. {coderace-1.2.0 → coderace-1.3.0}/tests/test_benchmark_trials.py +0 -0
  106. {coderace-1.2.0 → coderace-1.3.0}/tests/test_benchmark_v1_integration.py +0 -0
  107. {coderace-1.2.0 → coderace-1.3.0}/tests/test_builtins.py +0 -0
  108. {coderace-1.2.0 → coderace-1.3.0}/tests/test_cli.py +0 -0
  109. {coderace-1.2.0 → coderace-1.3.0}/tests/test_cli_store_integration.py +0 -0
  110. {coderace-1.2.0 → coderace-1.3.0}/tests/test_context_eval.py +0 -0
  111. {coderace-1.2.0 → coderace-1.3.0}/tests/test_context_eval_dashboard.py +0 -0
  112. {coderace-1.2.0 → coderace-1.3.0}/tests/test_cost.py +0 -0
  113. {coderace-1.2.0 → coderace-1.3.0}/tests/test_cost_config.py +0 -0
  114. {coderace-1.2.0 → coderace-1.3.0}/tests/test_cost_integration.py +0 -0
  115. {coderace-1.2.0 → coderace-1.3.0}/tests/test_dashboard.py +0 -0
  116. {coderace-1.2.0 → coderace-1.3.0}/tests/test_dashboard_cli.py +0 -0
  117. {coderace-1.2.0 → coderace-1.3.0}/tests/test_diff.py +0 -0
  118. {coderace-1.2.0 → coderace-1.3.0}/tests/test_elo.py +0 -0
  119. {coderace-1.2.0 → coderace-1.3.0}/tests/test_export.py +0 -0
  120. {coderace-1.2.0 → coderace-1.3.0}/tests/test_format_comment.py +0 -0
  121. {coderace-1.2.0 → coderace-1.3.0}/tests/test_full_workflow.py +0 -0
  122. {coderace-1.2.0 → coderace-1.3.0}/tests/test_git_ops.py +0 -0
  123. {coderace-1.2.0 → coderace-1.3.0}/tests/test_history.py +0 -0
  124. {coderace-1.2.0 → coderace-1.3.0}/tests/test_html_report.py +0 -0
  125. {coderace-1.2.0 → coderace-1.3.0}/tests/test_leaderboard.py +0 -0
  126. {coderace-1.2.0 → coderace-1.3.0}/tests/test_markdown_results.py +0 -0
  127. {coderace-1.2.0 → coderace-1.3.0}/tests/test_publish.py +0 -0
  128. {coderace-1.2.0 → coderace-1.3.0}/tests/test_race.py +0 -0
  129. {coderace-1.2.0 → coderace-1.3.0}/tests/test_reporter.py +0 -0
  130. {coderace-1.2.0 → coderace-1.3.0}/tests/test_scorer.py +0 -0
  131. {coderace-1.2.0 → coderace-1.3.0}/tests/test_statistics.py +0 -0
  132. {coderace-1.2.0 → coderace-1.3.0}/tests/test_stats.py +0 -0
  133. {coderace-1.2.0 → coderace-1.3.0}/tests/test_store.py +0 -0
  134. {coderace-1.2.0 → coderace-1.3.0}/tests/test_task.py +0 -0
  135. {coderace-1.2.0 → coderace-1.3.0}/tests/test_tasks_cli.py +0 -0
  136. {coderace-1.2.0 → coderace-1.3.0}/tests/test_verification_integration.py +0 -0
  137. {coderace-1.2.0 → coderace-1.3.0}/uv.lock +0 -0
@@ -1,5 +1,23 @@
1
1
  # Changelog
2
2
 
3
+ ## [1.3.0] - 2026-03-05
4
+
5
+ ### Added
6
+ - **Model selection**: Per-agent model override via `agent:model` syntax in `--agents` / `--agent` flags
7
+ - Example: `coderace run task.yaml --agent codex:gpt-5.4 --agent codex:gpt-5.3-codex`
8
+ - Example: `coderace benchmark --agents claude:opus-4-6,claude:sonnet-4-6`
9
+ - `BaseAdapter.__init__(model=None)`: all adapters accept optional model at construction
10
+ - `BaseAdapter.build_command(task, model=None)`: model parameter flows to CLI flag
11
+ - `parse_agent_spec()`, `make_display_name()`, `instantiate_adapter()` in `coderace.adapters`
12
+ - All adapters (codex, claude, aider, gemini, opencode) append `--model <name>` when specified
13
+ - Benchmark and race commands handle model-specific agents; display names flow to results, store, ELO, dashboard
14
+ - Task YAML: `agents` list accepts `agent:model` entries (e.g. `- codex:gpt-5.4`)
15
+
16
+ ### Changed
17
+ - `AgentResult.agent` is now the display name (`codex (gpt-5.4)`) when a model is specified
18
+ - ELO ratings, leaderboard, and dashboard automatically track model variants as separate entries
19
+ - Branch names sanitized to be git-compatible (colons replaced with dashes)
20
+
3
21
  ## [1.2.0] - 2026-03-03
4
22
 
5
23
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderace
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: Race coding agents against each other on real tasks
5
5
  Project-URL: Homepage, https://github.com/mikiships/coderace
6
6
  Project-URL: Repository, https://github.com/mikiships/coderace
@@ -30,6 +30,11 @@ Description-Content-Type: text/markdown
30
30
 
31
31
  # coderace
32
32
 
33
+ [![PyPI](https://img.shields.io/pypi/v/coderace)](https://pypi.org/project/coderace/)
34
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](#install)
35
+ [![Tests](https://img.shields.io/badge/tests-550%2B%20passing-brightgreen)](#)
36
+ [![License](https://img.shields.io/badge/license-MIT-lightgrey)](#license)
37
+
33
38
  Stop reading blog comparisons. Race coding agents against each other on real tasks in *your* repo with *your* code.
34
39
 
35
40
  Every week there's a new "Claude Code vs Codex vs Cursor" post. They test on toy problems with cherry-picked examples. coderace gives you automated, reproducible, scored comparisons on the tasks you actually care about.
@@ -340,6 +345,41 @@ Keys can be agent names (`claude`, `codex`, `aider`, `gemini`, `opencode`) or mo
340
345
 
341
346
  Pricing is easy to update: the table lives in `coderace/cost.py` as a plain dict.
342
347
 
348
+ ## Model Selection
349
+
350
+ Compare different models of the same agent head-to-head using the `agent:model` syntax:
351
+
352
+ ```bash
353
+ # Compare two Codex models on the same task
354
+ coderace run task.yaml --agent codex:gpt-5.4 --agent codex:gpt-5.3-codex
355
+
356
+ # Mix agents and models
357
+ coderace run task.yaml --agent codex:gpt-5.4 --agent claude:opus-4-6 --agent claude:sonnet-4-6
358
+
359
+ # Benchmark multiple model variants across built-in tasks
360
+ coderace benchmark --agents codex:gpt-5.4,codex:gpt-5.3-codex,claude:opus-4-6
361
+
362
+ # Race with model variants (parallel)
363
+ coderace race task.yaml --agent codex:gpt-5.4 --agent claude:opus-4-6
364
+ ```
365
+
366
+ In task YAML files:
367
+
368
+ ```yaml
369
+ agents:
370
+ - codex:gpt-5.4
371
+ - codex:gpt-5.3-codex
372
+ - claude:opus-4-6
373
+ - claude:sonnet-4-6
374
+ ```
375
+
376
+ **How it works:**
377
+ - `agent:model` splits on the first colon: `codex:gpt-5.4` → agent `codex`, model `gpt-5.4`
378
+ - The model is passed via `--model <name>` to the underlying CLI
379
+ - Results display as `codex (gpt-5.4)` vs `codex (gpt-5.3-codex)` for easy comparison
380
+ - ELO ratings, leaderboard, and dashboard track each model variant separately
381
+ - The same agent can appear multiple times with different models in one run
382
+
343
383
  ## Leaderboard & History
344
384
 
345
385
  Every `coderace run` automatically saves results to a local SQLite database (`~/.coderace/results.db`). Two new commands aggregate this data.
@@ -854,3 +894,6 @@ coderace context-eval --context-file v2-claude.md --task task.yaml --agents clau
854
894
  ## See Also
855
895
 
856
896
  - **[agentmd](https://github.com/mikiships/agentmd)** — Generate and score context files (CLAUDE.md, AGENTS.md, .cursorrules) for AI coding agents. Pair with coderace: generate context with agentmd, measure agent performance with coderace, iterate with data instead of vibes.
897
+ - **[agentlint](https://github.com/mikiships/agentlint)** — Lint AI agent git diffs for risky patterns (scope drift, secret leaks, test regression). Static analysis, no LLM required.
898
+
899
+ Measure (coderace) → Optimize (agentmd) → Guard (agentlint).
@@ -1,5 +1,10 @@
1
1
  # coderace
2
2
 
3
+ [![PyPI](https://img.shields.io/pypi/v/coderace)](https://pypi.org/project/coderace/)
4
+ [![Python](https://img.shields.io/badge/python-3.10%2B-blue)](#install)
5
+ [![Tests](https://img.shields.io/badge/tests-550%2B%20passing-brightgreen)](#)
6
+ [![License](https://img.shields.io/badge/license-MIT-lightgrey)](#license)
7
+
3
8
  Stop reading blog comparisons. Race coding agents against each other on real tasks in *your* repo with *your* code.
4
9
 
5
10
  Every week there's a new "Claude Code vs Codex vs Cursor" post. They test on toy problems with cherry-picked examples. coderace gives you automated, reproducible, scored comparisons on the tasks you actually care about.
@@ -310,6 +315,41 @@ Keys can be agent names (`claude`, `codex`, `aider`, `gemini`, `opencode`) or mo
310
315
 
311
316
  Pricing is easy to update: the table lives in `coderace/cost.py` as a plain dict.
312
317
 
318
+ ## Model Selection
319
+
320
+ Compare different models of the same agent head-to-head using the `agent:model` syntax:
321
+
322
+ ```bash
323
+ # Compare two Codex models on the same task
324
+ coderace run task.yaml --agent codex:gpt-5.4 --agent codex:gpt-5.3-codex
325
+
326
+ # Mix agents and models
327
+ coderace run task.yaml --agent codex:gpt-5.4 --agent claude:opus-4-6 --agent claude:sonnet-4-6
328
+
329
+ # Benchmark multiple model variants across built-in tasks
330
+ coderace benchmark --agents codex:gpt-5.4,codex:gpt-5.3-codex,claude:opus-4-6
331
+
332
+ # Race with model variants (parallel)
333
+ coderace race task.yaml --agent codex:gpt-5.4 --agent claude:opus-4-6
334
+ ```
335
+
336
+ In task YAML files:
337
+
338
+ ```yaml
339
+ agents:
340
+ - codex:gpt-5.4
341
+ - codex:gpt-5.3-codex
342
+ - claude:opus-4-6
343
+ - claude:sonnet-4-6
344
+ ```
345
+
346
+ **How it works:**
347
+ - `agent:model` splits on the first colon: `codex:gpt-5.4` → agent `codex`, model `gpt-5.4`
348
+ - The model is passed via `--model <name>` to the underlying CLI
349
+ - Results display as `codex (gpt-5.4)` vs `codex (gpt-5.3-codex)` for easy comparison
350
+ - ELO ratings, leaderboard, and dashboard track each model variant separately
351
+ - The same agent can appear multiple times with different models in one run
352
+
313
353
  ## Leaderboard & History
314
354
 
315
355
  Every `coderace run` automatically saves results to a local SQLite database (`~/.coderace/results.db`). Two new commands aggregate this data.
@@ -824,3 +864,6 @@ coderace context-eval --context-file v2-claude.md --task task.yaml --agents clau
824
864
  ## See Also
825
865
 
826
866
  - **[agentmd](https://github.com/mikiships/agentmd)** — Generate and score context files (CLAUDE.md, AGENTS.md, .cursorrules) for AI coding agents. Pair with coderace: generate context with agentmd, measure agent performance with coderace, iterate with data instead of vibes.
867
+ - **[agentlint](https://github.com/mikiships/agentlint)** — Lint AI agent git diffs for risky patterns (scope drift, secret leaks, test regression). Static analysis, no LLM required.
868
+
869
+ Measure (coderace) → Optimize (agentmd) → Guard (agentlint).
@@ -0,0 +1,121 @@
1
+ # All-Day Build Contract: Model Selection for Adapters
2
+
3
+ Status: In Progress
4
+ Date: 2026-03-05
5
+ Owner: Codex execution pass
6
+ Scope type: Deliverable-gated (no hour promises)
7
+
8
+ ## 1. Objective
9
+
10
+ Add per-agent model selection to coderace so users can benchmark different models within the same agent CLI. For example: `coderace run task.yaml --agents codex:gpt-5.4,codex:gpt-5.3-codex,claude:opus-4-6,claude:sonnet-4-6` to compare models head-to-head on the same tasks.
11
+
12
+ This enables the "which model is actually best for coding" benchmark content that vibes-based blog posts can't provide.
13
+
14
+ This contract is considered complete only when every deliverable and validation gate below is satisfied.
15
+
16
+ ## 2. Non-Negotiable Build Rules
17
+
18
+ 1. No time-based completion claims.
19
+ 2. Completion is allowed only when all checklist items are checked.
20
+ 3. Full test suite must pass at the end.
21
+ 4. New features must ship with docs and report addendum updates in the same pass.
22
+ 5. CLI outputs must be deterministic and schema-backed where specified.
23
+ 6. Never modify files outside the project directory.
24
+ 7. Commit after each completed deliverable (not at the end).
25
+ 8. If stuck on same issue for 3 attempts, stop and write a blocker report.
26
+ 9. Do NOT refactor, restyle, or "improve" code outside the deliverables.
27
+ 10. Read existing tests and docs before writing new code.
28
+
29
+ ## 3. Feature Deliverables
30
+
31
+ ### D1. Base Adapter Model Support (core)
32
+
33
+ Add optional `model` parameter to BaseAdapter so subclasses can receive a model override.
34
+
35
+ Required files:
36
+ - `coderace/adapters/base.py`
37
+
38
+ - [ ] Add `model: Optional[str] = None` to `__init__` (or as class attribute)
39
+ - [ ] Pass `model` through to `build_command` signature: `build_command(self, task_description: str, model: Optional[str] = None) -> list[str]`
40
+ - [ ] Update `run()` to pass model to `build_command`
41
+ - [ ] Update `parse_cost` calls to use the model override when provided
42
+ - [ ] Tests for D1
43
+
44
+ ### D2. Codex and Claude Adapter Model Flags
45
+
46
+ Update the two main adapters to pass `--model` when a model is specified.
47
+
48
+ Required files:
49
+ - `coderace/adapters/codex.py`
50
+ - `coderace/adapters/claude.py`
51
+
52
+ - [ ] CodexAdapter.build_command: append `--model`, model_name when model is not None
53
+ - [ ] ClaudeAdapter.build_command: append `--model`, model_name when model is not None
54
+ - [ ] Update parse_cost to use the provided model name for accurate pricing
55
+ - [ ] Also update aider.py, gemini.py, opencode.py adapters if they support model flags (check their --help)
56
+ - [ ] Tests for D2
57
+
58
+ ### D3. Agent:Model CLI Syntax
59
+
60
+ Parse `agent:model` syntax in the CLI so users can specify models per agent.
61
+
62
+ Required files:
63
+ - `coderace/cli.py` (or wherever `--agents` is parsed)
64
+ - `coderace/adapters/__init__.py` (adapter registry/factory)
65
+
66
+ The syntax: `--agents codex:gpt-5.4,claude:opus-4-6`
67
+ - If no `:model` suffix, use the adapter's default (current behavior)
68
+ - If `:model` suffix, pass it through to the adapter
69
+ - The same agent can appear multiple times with different models
70
+ - Agent display name in results should include the model: `codex (gpt-5.4)` vs `codex (gpt-5.3-codex)`
71
+
72
+ - [ ] Parse `agent:model` in CLI --agents flag
73
+ - [ ] Support duplicate agents with different models in the same run
74
+ - [ ] Display agent+model in result tables and reports
75
+ - [ ] Works with `run`, `benchmark`, and `race` commands
76
+ - [ ] Tests for D3
77
+
78
+ ### D4. Benchmark and Race Command Integration
79
+
80
+ Ensure `benchmark` and `race` commands correctly handle model-specific agents.
81
+
82
+ Required files:
83
+ - `coderace/benchmark.py`
84
+ - `coderace/commands/` (race command if separate)
85
+ - `coderace/store.py` (results storage)
86
+
87
+ - [ ] Benchmark results store agent+model as the identifier (not just agent name)
88
+ - [ ] ELO ratings track agent+model combinations separately
89
+ - [ ] Leaderboard shows model variants as separate entries
90
+ - [ ] Dashboard HTML includes model information
91
+ - [ ] Tests for D4
92
+
93
+ ### D5. Documentation and Version Bump
94
+
95
+ - [ ] Update README.md with model selection examples
96
+ - [ ] Add model selection section to examples/
97
+ - [ ] Update CHANGELOG.md
98
+ - [ ] Bump version to 1.3.0 in pyproject.toml
99
+ - [ ] All existing 526 tests still pass
100
+ - [ ] New tests bring total to 550+
101
+
102
+ ## 4. Test Requirements
103
+
104
+ - [ ] Unit tests for each adapter with model override
105
+ - [ ] Unit tests for agent:model parsing
106
+ - [ ] Integration test: dry-run benchmark with model variants
107
+ - [ ] Edge cases: invalid model name, empty model, agent without model support
108
+ - [ ] All existing 526 tests must still pass
109
+
110
+ ## 5. Reports
111
+
112
+ - Write progress to `progress-log.md` after each deliverable
113
+ - Include: what was built, what tests pass, what's next, any blockers
114
+ - Final summary when all deliverables done or stopped
115
+
116
+ ## 6. Stop Conditions
117
+
118
+ - All deliverables checked and all tests passing -> DONE
119
+ - 3 consecutive failed attempts on same issue -> STOP, write blocker report
120
+ - Scope creep detected (new requirements discovered) -> STOP, report what's new
121
+ - All tests passing but deliverables remain -> continue to next deliverable
@@ -1,3 +1,3 @@
1
1
  """coderace - Race coding agents against each other on real tasks."""
2
2
 
3
- __version__ = "1.2.0"
3
+ __version__ = "1.3.0"
@@ -0,0 +1,77 @@
1
+ """Agent adapters for coderace."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ from coderace.adapters.aider import AiderAdapter
8
+ from coderace.adapters.base import BaseAdapter
9
+ from coderace.adapters.claude import ClaudeAdapter
10
+ from coderace.adapters.codex import CodexAdapter
11
+ from coderace.adapters.gemini import GeminiAdapter
12
+ from coderace.adapters.opencode import OpenCodeAdapter
13
+
14
+ ADAPTERS: dict[str, type[BaseAdapter]] = {
15
+ "claude": ClaudeAdapter,
16
+ "codex": CodexAdapter,
17
+ "aider": AiderAdapter,
18
+ "gemini": GeminiAdapter,
19
+ "opencode": OpenCodeAdapter,
20
+ }
21
+
22
+
23
+ def parse_agent_spec(spec: str) -> tuple[str, Optional[str]]:
24
+ """Parse an agent spec string into (agent_name, model_or_None).
25
+
26
+ Examples:
27
+ "codex" -> ("codex", None)
28
+ "codex:gpt-5.4" -> ("codex", "gpt-5.4")
29
+ "claude:opus-4-6" -> ("claude", "opus-4-6")
30
+ """
31
+ if ":" in spec:
32
+ agent_name, model = spec.split(":", 1)
33
+ return agent_name.strip(), model.strip() or None
34
+ return spec.strip(), None
35
+
36
+
37
+ def make_display_name(agent_name: str, model: Optional[str]) -> str:
38
+ """Return display name for agent+model combo.
39
+
40
+ Examples:
41
+ ("codex", None) -> "codex"
42
+ ("codex", "gpt-5.4") -> "codex (gpt-5.4)"
43
+ """
44
+ if model:
45
+ return f"{agent_name} ({model})"
46
+ return agent_name
47
+
48
+
49
+ def instantiate_adapter(spec: str) -> BaseAdapter:
50
+ """Instantiate an adapter from an agent spec string (e.g. 'codex:gpt-5.4').
51
+
52
+ The returned adapter has:
53
+ - adapter.model set to the parsed model (or None)
54
+ - adapter.name set to the display name (e.g. 'codex (gpt-5.4)')
55
+
56
+ Raises KeyError if the agent name is not in ADAPTERS.
57
+ """
58
+ agent_name, model = parse_agent_spec(spec)
59
+ adapter_cls = ADAPTERS[agent_name]
60
+ adapter = adapter_cls(model=model)
61
+ # Override the instance name to be the display name
62
+ adapter.name = make_display_name(agent_name, model)
63
+ return adapter
64
+
65
+
66
+ __all__ = [
67
+ "ADAPTERS",
68
+ "BaseAdapter",
69
+ "ClaudeAdapter",
70
+ "CodexAdapter",
71
+ "AiderAdapter",
72
+ "GeminiAdapter",
73
+ "OpenCodeAdapter",
74
+ "parse_agent_spec",
75
+ "make_display_name",
76
+ "instantiate_adapter",
77
+ ]
@@ -7,27 +7,34 @@ from typing import Optional
7
7
  from coderace.adapters.base import BaseAdapter
8
8
  from coderace.cost import CostResult, parse_aider_cost
9
9
 
10
+ DEFAULT_AIDER_MODEL = "aider-default"
11
+
10
12
 
11
13
  class AiderAdapter(BaseAdapter):
12
14
  """Adapter for Aider coding assistant."""
13
15
 
14
16
  name = "aider"
15
17
 
16
- def build_command(self, task_description: str) -> list[str]:
17
- return [
18
+ def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
19
+ cmd = [
18
20
  "aider",
19
21
  "--message",
20
22
  task_description,
21
23
  "--yes",
22
24
  "--no-auto-commits",
23
25
  ]
26
+ effective_model = model or self.model
27
+ if effective_model:
28
+ cmd += ["--model", effective_model]
29
+ return cmd
24
30
 
25
31
  def parse_cost(
26
32
  self,
27
33
  stdout: str,
28
34
  stderr: str,
29
- model_name: str = "aider-default",
35
+ model_name: str = "",
30
36
  custom_pricing: dict[str, tuple[float, float]] | None = None,
31
37
  ) -> Optional[CostResult]:
32
38
  """Parse cost data from Aider output."""
33
- return parse_aider_cost(stdout, stderr, model_name, custom_pricing)
39
+ effective_model = model_name or self.model or DEFAULT_AIDER_MODEL
40
+ return parse_aider_cost(stdout, stderr, effective_model, custom_pricing)
@@ -17,8 +17,12 @@ class BaseAdapter(ABC):
17
17
 
18
18
  name: str = "base"
19
19
 
20
+ def __init__(self, model: Optional[str] = None) -> None:
21
+ """Initialize adapter with optional model override."""
22
+ self.model = model
23
+
20
24
  @abstractmethod
21
- def build_command(self, task_description: str) -> list[str]:
25
+ def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
22
26
  """Build the CLI command to invoke this agent."""
23
27
  ...
24
28
 
@@ -44,7 +48,8 @@ class BaseAdapter(ABC):
44
48
  custom_pricing: dict[str, tuple[float, float]] | None = None,
45
49
  ) -> AgentResult:
46
50
  """Run the agent on a task and capture results."""
47
- cmd = self.build_command(task_description)
51
+ model = self.model
52
+ cmd = self.build_command(task_description, model=model)
48
53
  start = time.monotonic()
49
54
  timed_out = False
50
55
 
@@ -76,7 +81,12 @@ class BaseAdapter(ABC):
76
81
  cost_result: Optional[CostResult] = None
77
82
  if not no_cost:
78
83
  try:
79
- cost_result = self.parse_cost(stdout, stderr, custom_pricing=custom_pricing)
84
+ cost_result = self.parse_cost(
85
+ stdout,
86
+ stderr,
87
+ model_name=model or "",
88
+ custom_pricing=custom_pricing,
89
+ )
80
90
  except Exception:
81
91
  pass
82
92
 
@@ -7,29 +7,35 @@ from typing import Optional
7
7
  from coderace.adapters.base import BaseAdapter
8
8
  from coderace.cost import CostResult, parse_claude_cost
9
9
 
10
+ DEFAULT_CLAUDE_MODEL = "claude-sonnet-4-6"
11
+
10
12
 
11
13
  class ClaudeAdapter(BaseAdapter):
12
14
  """Adapter for Claude Code CLI."""
13
15
 
14
16
  name = "claude"
15
17
 
16
- def build_command(self, task_description: str) -> list[str]:
17
- return [
18
+ def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
19
+ cmd = [
18
20
  "claude",
19
21
  "--print",
20
22
  "--output-format",
21
23
  "json",
22
24
  "--dangerously-skip-permissions",
23
- "-p",
24
- task_description,
25
25
  ]
26
+ effective_model = model or self.model
27
+ if effective_model:
28
+ cmd += ["--model", effective_model]
29
+ cmd += ["-p", task_description]
30
+ return cmd
26
31
 
27
32
  def parse_cost(
28
33
  self,
29
34
  stdout: str,
30
35
  stderr: str,
31
- model_name: str = "claude-sonnet-4-6",
36
+ model_name: str = "",
32
37
  custom_pricing: dict[str, tuple[float, float]] | None = None,
33
38
  ) -> Optional[CostResult]:
34
39
  """Parse cost data from Claude Code output."""
35
- return parse_claude_cost(stdout, stderr, model_name, custom_pricing)
40
+ effective_model = model_name or self.model or DEFAULT_CLAUDE_MODEL
41
+ return parse_claude_cost(stdout, stderr, effective_model, custom_pricing)
@@ -7,26 +7,33 @@ from typing import Optional
7
7
  from coderace.adapters.base import BaseAdapter
8
8
  from coderace.cost import CostResult, parse_codex_cost
9
9
 
10
+ DEFAULT_CODEX_MODEL = "gpt-5.3-codex"
11
+
10
12
 
11
13
  class CodexAdapter(BaseAdapter):
12
14
  """Adapter for OpenAI Codex CLI."""
13
15
 
14
16
  name = "codex"
15
17
 
16
- def build_command(self, task_description: str) -> list[str]:
17
- return [
18
+ def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
19
+ cmd = [
18
20
  "codex",
19
21
  "exec",
20
22
  "--full-auto",
21
- task_description,
22
23
  ]
24
+ effective_model = model or self.model
25
+ if effective_model:
26
+ cmd += ["--model", effective_model]
27
+ cmd.append(task_description)
28
+ return cmd
23
29
 
24
30
  def parse_cost(
25
31
  self,
26
32
  stdout: str,
27
33
  stderr: str,
28
- model_name: str = "gpt-5.3-codex",
34
+ model_name: str = "",
29
35
  custom_pricing: dict[str, tuple[float, float]] | None = None,
30
36
  ) -> Optional[CostResult]:
31
37
  """Parse cost data from Codex CLI output."""
32
- return parse_codex_cost(stdout, stderr, model_name, custom_pricing)
38
+ effective_model = model_name or self.model or DEFAULT_CODEX_MODEL
39
+ return parse_codex_cost(stdout, stderr, effective_model, custom_pricing)
@@ -7,25 +7,29 @@ from typing import Optional
7
7
  from coderace.adapters.base import BaseAdapter
8
8
  from coderace.cost import CostResult, parse_gemini_cost
9
9
 
10
+ DEFAULT_GEMINI_MODEL = "gemini-2.5-pro"
11
+
10
12
 
11
13
  class GeminiAdapter(BaseAdapter):
12
14
  """Adapter for Google Gemini CLI."""
13
15
 
14
16
  name = "gemini"
15
17
 
16
- def build_command(self, task_description: str) -> list[str]:
17
- return [
18
- "gemini",
19
- "-p",
20
- task_description,
21
- ]
18
+ def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
19
+ cmd = ["gemini"]
20
+ effective_model = model or self.model
21
+ if effective_model:
22
+ cmd += ["--model", effective_model]
23
+ cmd += ["-p", task_description]
24
+ return cmd
22
25
 
23
26
  def parse_cost(
24
27
  self,
25
28
  stdout: str,
26
29
  stderr: str,
27
- model_name: str = "gemini-2.5-pro",
30
+ model_name: str = "",
28
31
  custom_pricing: dict[str, tuple[float, float]] | None = None,
29
32
  ) -> Optional[CostResult]:
30
33
  """Parse cost data from Gemini CLI output."""
31
- return parse_gemini_cost(stdout, stderr, model_name, custom_pricing)
34
+ effective_model = model_name or self.model or DEFAULT_GEMINI_MODEL
35
+ return parse_gemini_cost(stdout, stderr, effective_model, custom_pricing)
@@ -7,25 +7,29 @@ from typing import Optional
7
7
  from coderace.adapters.base import BaseAdapter
8
8
  from coderace.cost import CostResult, parse_opencode_cost
9
9
 
10
+ DEFAULT_OPENCODE_MODEL = "opencode-default"
11
+
10
12
 
11
13
  class OpenCodeAdapter(BaseAdapter):
12
14
  """Adapter for OpenCode CLI (terminal-first AI coding agent)."""
13
15
 
14
16
  name = "opencode"
15
17
 
16
- def build_command(self, task_description: str) -> list[str]:
17
- return [
18
- "opencode",
19
- "run",
20
- task_description,
21
- ]
18
+ def build_command(self, task_description: str, model: Optional[str] = None) -> list[str]:
19
+ cmd = ["opencode", "run"]
20
+ effective_model = model or self.model
21
+ if effective_model:
22
+ cmd += ["--model", effective_model]
23
+ cmd.append(task_description)
24
+ return cmd
22
25
 
23
26
  def parse_cost(
24
27
  self,
25
28
  stdout: str,
26
29
  stderr: str,
27
- model_name: str = "opencode-default",
30
+ model_name: str = "",
28
31
  custom_pricing: dict[str, tuple[float, float]] | None = None,
29
32
  ) -> Optional[CostResult]:
30
33
  """Parse cost data from OpenCode output."""
31
- return parse_opencode_cost(stdout, stderr, model_name, custom_pricing)
34
+ effective_model = model_name or self.model or DEFAULT_OPENCODE_MODEL
35
+ return parse_opencode_cost(stdout, stderr, effective_model, custom_pricing)