coderace 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. coderace-0.2.0/CHANGELOG.md +34 -0
  2. {coderace-0.1.0 → coderace-0.2.0}/PKG-INFO +79 -10
  3. {coderace-0.1.0 → coderace-0.2.0}/README.md +78 -9
  4. coderace-0.2.0/all-day-build-contract-v0.2.md +137 -0
  5. {coderace-0.1.0 → coderace-0.2.0}/coderace/__init__.py +1 -1
  6. {coderace-0.1.0 → coderace-0.2.0}/coderace/adapters/__init__.py +3 -0
  7. coderace-0.2.0/coderace/adapters/opencode.py +18 -0
  8. coderace-0.2.0/coderace/cli.py +509 -0
  9. coderace-0.2.0/coderace/html_report.py +134 -0
  10. {coderace-0.1.0 → coderace-0.2.0}/coderace/reporter.py +60 -0
  11. {coderace-0.1.0 → coderace-0.2.0}/coderace/scorer.py +10 -15
  12. coderace-0.2.0/coderace/stats.py +97 -0
  13. {coderace-0.1.0 → coderace-0.2.0}/coderace/task.py +13 -0
  14. coderace-0.2.0/coderace/types.py +130 -0
  15. coderace-0.2.0/examples/add-type-hints.yaml +31 -0
  16. coderace-0.2.0/examples/example-task.yaml +23 -0
  17. coderace-0.2.0/examples/fix-edge-case.yaml +37 -0
  18. coderace-0.2.0/examples/write-tests.yaml +37 -0
  19. {coderace-0.1.0 → coderace-0.2.0}/pyproject.toml +4 -1
  20. {coderace-0.1.0 → coderace-0.2.0}/tests/test_adapters.py +12 -1
  21. {coderace-0.1.0 → coderace-0.2.0}/tests/test_cli.py +2 -1
  22. coderace-0.2.0/tests/test_examples.py +65 -0
  23. coderace-0.2.0/tests/test_html_report.py +95 -0
  24. coderace-0.2.0/tests/test_scorer.py +80 -0
  25. coderace-0.2.0/tests/test_stats.py +76 -0
  26. coderace-0.2.0/uv.lock +349 -0
  27. coderace-0.1.0/coderace/cli.py +0 -306
  28. coderace-0.1.0/coderace/types.py +0 -73
  29. coderace-0.1.0/tests/test_scorer.py +0 -46
  30. {coderace-0.1.0 → coderace-0.2.0}/.github/workflows/publish.yml +0 -0
  31. {coderace-0.1.0 → coderace-0.2.0}/.gitignore +0 -0
  32. {coderace-0.1.0 → coderace-0.2.0}/LICENSE +0 -0
  33. {coderace-0.1.0 → coderace-0.2.0}/coderace/adapters/aider.py +0 -0
  34. {coderace-0.1.0 → coderace-0.2.0}/coderace/adapters/base.py +0 -0
  35. {coderace-0.1.0 → coderace-0.2.0}/coderace/adapters/claude.py +0 -0
  36. {coderace-0.1.0 → coderace-0.2.0}/coderace/adapters/codex.py +0 -0
  37. {coderace-0.1.0 → coderace-0.2.0}/coderace/adapters/gemini.py +0 -0
  38. {coderace-0.1.0 → coderace-0.2.0}/coderace/git_ops.py +0 -0
  39. {coderace-0.1.0 → coderace-0.2.0}/tests/__init__.py +0 -0
  40. {coderace-0.1.0 → coderace-0.2.0}/tests/conftest.py +0 -0
  41. {coderace-0.1.0 → coderace-0.2.0}/tests/test_git_ops.py +0 -0
  42. {coderace-0.1.0 → coderace-0.2.0}/tests/test_reporter.py +0 -0
  43. {coderace-0.1.0 → coderace-0.2.0}/tests/test_task.py +0 -0
@@ -0,0 +1,34 @@
1
+ # Changelog
2
+
3
+ ## [0.2.0] - 2026-02-23
4
+
5
+ ### Added
6
+
7
+ - **OpenCode adapter** - OpenCode (terminal-first open-source coding agent) is now a supported agent (`opencode` in task YAML)
8
+ - **Custom scoring weights** - Override default weights in task YAML via `scoring:` section; weights are auto-normalized; supports short aliases (`tests`, `exit`, `lint`, `time`, `lines`)
9
+ - **HTML reports** - Self-contained single-file HTML report auto-generated on every run at `.coderace/<task>-results.html`; also `coderace results --html report.html` for manual export; sortable columns, dark theme
10
+ - **Statistical mode** - `coderace run task.yaml --runs N` for multi-run comparison; shows mean ± stddev for score, time, and lines changed; saves per-run and aggregated JSON
11
+ - **Example tasks** - `examples/` directory with 3 ready-to-use templates: `add-type-hints.yaml`, `fix-edge-case.yaml`, `write-tests.yaml`
12
+
13
+ ### Changed
14
+
15
+ - `coderace init` template now includes OpenCode in default agent list
16
+ - `coderace init` template includes commented scoring example
17
+ - README: "Try it now" section, statistical mode docs, HTML report docs, custom scoring docs, updated agent table
18
+
19
+ ### Fixed
20
+
21
+ - `opencode` now accepted as a valid agent name in task validation
22
+
23
+ ## [0.1.0] - 2026-02-22
24
+
25
+ ### Added
26
+
27
+ - Initial release
28
+ - CLI: `init`, `run`, `results`, `version` commands
29
+ - 4 agent adapters: Claude Code, Codex, Aider, Gemini CLI
30
+ - Sequential and parallel (git worktrees) run modes
31
+ - Composite scoring: tests (40%), exit (20%), lint (15%), time (15%), lines (10%)
32
+ - JSON results output
33
+ - Rich terminal table output
34
+ - `coderace run --parallel` using git worktrees
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderace
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Race coding agents against each other on real tasks
5
5
  Project-URL: Homepage, https://github.com/mikiships/coderace
6
6
  Project-URL: Repository, https://github.com/mikiships/coderace
@@ -30,9 +30,11 @@ Description-Content-Type: text/markdown
30
30
 
31
31
  # coderace
32
32
 
33
- Race coding agents against each other on real tasks in your repo.
33
+ Stop reading blog comparisons. Race coding agents against each other on real tasks in *your* repo with *your* code.
34
34
 
35
- Define a task. Run it against Claude Code, Codex, and Aider. Get a scored comparison table.
35
+ Every week there's a new "Claude Code vs Codex vs Cursor" post. They test on toy problems with cherry-picked examples. coderace gives you automated, reproducible, scored comparisons on the tasks you actually care about.
36
+
37
+ Define a task. Run it against Claude Code, Codex, Aider, Gemini CLI, and OpenCode. Get a scored comparison table.
36
38
 
37
39
  ## Install
38
40
 
@@ -108,16 +110,71 @@ Terminal table with Rich formatting:
108
110
  └──────┴────────┴───────┴───────┴──────┴──────┴──────────┴───────┘
109
111
  ```
110
112
 
111
- Results also saved as JSON in `.coderace/<task>-results.json`.
113
+ Results also saved as JSON in `.coderace/<task>-results.json` and as a self-contained HTML report in `.coderace/<task>-results.html`.
114
+
115
+ ## Try It Now
116
+
117
+ The `examples/` directory has ready-to-use task templates:
118
+
119
+ ```bash
120
+ # Race agents on adding type hints to your project
121
+ coderace run examples/add-type-hints.yaml
122
+
123
+ # Race agents on fixing an edge case bug
124
+ coderace run examples/fix-edge-case.yaml
125
+
126
+ # Race agents on writing new tests
127
+ coderace run examples/write-tests.yaml
128
+ ```
129
+
130
+ Edit the `repo` and `description` fields to point at your actual project and describe your real task.
131
+
132
+ ## Statistical Mode
133
+
134
+ Run each agent multiple times and get mean ± stddev:
135
+
136
+ ```bash
137
+ coderace run task.yaml --runs 5
138
+ ```
139
+
140
+ Useful for tasks with variable outcomes (LLM nondeterminism is real).
141
+
142
+ ## HTML Reports
143
+
144
+ Export results as a shareable single-file HTML report:
145
+
146
+ ```bash
147
+ # Auto-generated on every run at .coderace/<task>-results.html
148
+ # Or export manually:
149
+ coderace results task.yaml --html report.html
150
+ ```
151
+
152
+ The HTML report has sortable columns and a dark theme. Drop it in a blog post or Slack.
153
+
154
+ ## Custom Scoring
155
+
156
+ Override the default weights in your task YAML:
157
+
158
+ ```yaml
159
+ scoring:
160
+   tests: 60  # tests passing (default 40)
161
+   exit: 20   # clean exit (default 20)
162
+   lint: 10   # lint clean (default 15)
163
+   time: 5    # wall time (default 15)
164
+   lines: 5   # lines changed (default 10)
165
+ ```
166
+
167
+ Weights are normalized automatically, so they don't need to sum to 100.
112
168
 
113
169
  ## Supported Agents
114
170
 
115
- | Agent | CLI | Command |
116
- |-------|-----|---------|
117
- | Claude Code | `claude` | `claude --print --output-format json -p "<task>"` |
118
- | Codex | `codex` | `codex --quiet --full-auto -p "<task>"` |
119
- | Aider | `aider` | `aider --message "<task>" --yes --no-auto-commits` |
120
- | Gemini CLI | `gemini` | `gemini --non-interactive -p "<task>"` |
171
+ | Agent | CLI | Notes |
172
+ |-------|-----|-------|
173
+ | Claude Code | `claude` | Anthropic's coding agent |
174
+ | Codex | `codex` | OpenAI Codex CLI |
175
+ | Aider | `aider` | Git-integrated AI coding |
176
+ | Gemini CLI | `gemini` | Google's Gemini CLI |
177
+ | OpenCode | `opencode` | Open-source terminal agent |
121
178
 
122
179
  Each agent must be installed and authenticated separately.
123
180
 
@@ -131,6 +188,18 @@ coderace run task.yaml --parallel
131
188
 
132
189
  Sequential mode (default) runs agents one at a time on the same repo.
133
190
 
191
+ ## Why coderace?
192
+
193
+ **Blog posts compare models. coderace compares agents on your work.**
194
+
195
+ - Run on your actual codebase, not HumanEval
196
+ - Automated scoring: tests, lint, time, lines changed
197
+ - Parallel mode with git worktrees (no interference between agents)
198
+ - JSON output for CI integration and tracking over time
199
+ - Works with any agent that has a CLI
200
+
201
+ The goal isn't "which model is best." It's "which agent solves my specific problem best."
202
+
134
203
  ## Requirements
135
204
 
136
205
  - Python 3.10+
@@ -1,8 +1,10 @@
1
1
  # coderace
2
2
 
3
- Race coding agents against each other on real tasks in your repo.
3
+ Stop reading blog comparisons. Race coding agents against each other on real tasks in *your* repo with *your* code.
4
4
 
5
- Define a task. Run it against Claude Code, Codex, and Aider. Get a scored comparison table.
5
+ Every week there's a new "Claude Code vs Codex vs Cursor" post. They test on toy problems with cherry-picked examples. coderace gives you automated, reproducible, scored comparisons on the tasks you actually care about.
6
+
7
+ Define a task. Run it against Claude Code, Codex, Aider, Gemini CLI, and OpenCode. Get a scored comparison table.
6
8
 
7
9
  ## Install
8
10
 
@@ -78,16 +80,71 @@ Terminal table with Rich formatting:
78
80
  └──────┴────────┴───────┴───────┴──────┴──────┴──────────┴───────┘
79
81
  ```
80
82
 
81
- Results also saved as JSON in `.coderace/<task>-results.json`.
83
+ Results also saved as JSON in `.coderace/<task>-results.json` and as a self-contained HTML report in `.coderace/<task>-results.html`.
84
+
85
+ ## Try It Now
86
+
87
+ The `examples/` directory has ready-to-use task templates:
88
+
89
+ ```bash
90
+ # Race agents on adding type hints to your project
91
+ coderace run examples/add-type-hints.yaml
92
+
93
+ # Race agents on fixing an edge case bug
94
+ coderace run examples/fix-edge-case.yaml
95
+
96
+ # Race agents on writing new tests
97
+ coderace run examples/write-tests.yaml
98
+ ```
99
+
100
+ Edit the `repo` and `description` fields to point at your actual project and describe your real task.
101
+
102
+ ## Statistical Mode
103
+
104
+ Run each agent multiple times and get mean ± stddev:
105
+
106
+ ```bash
107
+ coderace run task.yaml --runs 5
108
+ ```
109
+
110
+ Useful for tasks with variable outcomes (LLM nondeterminism is real).
111
+
112
+ ## HTML Reports
113
+
114
+ Export results as a shareable single-file HTML report:
115
+
116
+ ```bash
117
+ # Auto-generated on every run at .coderace/<task>-results.html
118
+ # Or export manually:
119
+ coderace results task.yaml --html report.html
120
+ ```
121
+
122
+ The HTML report has sortable columns and a dark theme. Drop it in a blog post or Slack.
123
+
124
+ ## Custom Scoring
125
+
126
+ Override the default weights in your task YAML:
127
+
128
+ ```yaml
129
+ scoring:
130
+   tests: 60  # tests passing (default 40)
131
+   exit: 20   # clean exit (default 20)
132
+   lint: 10   # lint clean (default 15)
133
+   time: 5    # wall time (default 15)
134
+   lines: 5   # lines changed (default 10)
135
+ ```
136
+
137
+ Weights are normalized automatically, so they don't need to sum to 100.
82
138
 
83
139
  ## Supported Agents
84
140
 
85
- | Agent | CLI | Command |
86
- |-------|-----|---------|
87
- | Claude Code | `claude` | `claude --print --output-format json -p "<task>"` |
88
- | Codex | `codex` | `codex --quiet --full-auto -p "<task>"` |
89
- | Aider | `aider` | `aider --message "<task>" --yes --no-auto-commits` |
90
- | Gemini CLI | `gemini` | `gemini --non-interactive -p "<task>"` |
141
+ | Agent | CLI | Notes |
142
+ |-------|-----|-------|
143
+ | Claude Code | `claude` | Anthropic's coding agent |
144
+ | Codex | `codex` | OpenAI Codex CLI |
145
+ | Aider | `aider` | Git-integrated AI coding |
146
+ | Gemini CLI | `gemini` | Google's Gemini CLI |
147
+ | OpenCode | `opencode` | Open-source terminal agent |
91
148
 
92
149
  Each agent must be installed and authenticated separately.
93
150
 
@@ -101,6 +158,18 @@ coderace run task.yaml --parallel
101
158
 
102
159
  Sequential mode (default) runs agents one at a time on the same repo.
103
160
 
161
+ ## Why coderace?
162
+
163
+ **Blog posts compare models. coderace compares agents on your work.**
164
+
165
+ - Run on your actual codebase, not HumanEval
166
+ - Automated scoring: tests, lint, time, lines changed
167
+ - Parallel mode with git worktrees (no interference between agents)
168
+ - JSON output for CI integration and tracking over time
169
+ - Works with any agent that has a CLI
170
+
171
+ The goal isn't "which model is best." It's "which agent solves my specific problem best."
172
+
104
173
  ## Requirements
105
174
 
106
175
  - Python 3.10+
@@ -0,0 +1,137 @@
1
+ # All-Day Build Contract: coderace v0.2.0
2
+
3
+ Status: In Progress
4
+ Date: 2026-02-23
5
+ Owner: Codex/sub-agent execution pass
6
+ Scope type: Deliverable-gated (no hour promises)
7
+
8
+ ## 1. Objective
9
+
10
+ Ship coderace v0.2.0 with five new features that make comparison results shareable, statistically meaningful, and broader in agent coverage. The "Claude Code vs Codex" comparison trend is peaking this week. OpenCode (60k-star open-source alternative) just got a major benchmark review. Adding OpenCode as the 5th agent + HTML reports makes coderace the go-to tool for this moment.
11
+
12
+ This contract is considered complete only when every deliverable and validation gate below is satisfied.
13
+
14
+ ## 2. Non-Negotiable Build Rules
15
+
16
+ 1. No time-based completion claims.
17
+ 2. Completion is allowed only when all checklist items are checked.
18
+ 3. Full test suite must pass at the end (existing 39 tests + new tests).
19
+ 4. New features must ship with docs and report addendum updates in the same pass.
20
+ 5. CLI outputs must be deterministic and schema-backed where specified.
21
+ 6. Never modify files outside the project directory.
22
+ 7. Commit after each completed deliverable (not at the end).
23
+ 8. If stuck on same issue for 3 attempts, stop and write a blocker report.
24
+ 9. Do NOT refactor, restyle, or "improve" code outside the deliverables.
25
+ 10. Read existing tests and docs before writing new code.
26
+
27
+ ## 3. Feature Deliverables
28
+
29
+ ### D1. OpenCode Adapter (5th CLI agent)
30
+
31
+ Add OpenCode CLI as a supported agent. OpenCode is a terminal-first open-source coding assistant with 60k+ GitHub stars. It's invoked as `opencode` with similar patterns to other CLI agents.
32
+
33
+ Required files:
34
+ - `coderace/adapters/opencode.py`
35
+ - `tests/test_opencode_adapter.py`
36
+
37
+ - [ ] Implement OpenCode adapter following existing adapter pattern (see claude.py, codex.py, aider.py, gemini.py)
38
+ - [ ] OpenCode invocation: `opencode` CLI with task prompt via stdin or --prompt flag (research actual CLI interface)
39
+ - [ ] If OpenCode CLI is not installed, adapter should detect and report clearly
40
+ - [ ] Register adapter in adapter registry
41
+ - [ ] Add `opencode` to task YAML agents list support
42
+ - [ ] Tests: unit tests for adapter (mock CLI invocation), integration with run pipeline
43
+ - [ ] Update README: add OpenCode to supported agents list and example YAML
44
+
45
+ ### D2. Custom Scoring Weights
46
+
47
+ Allow users to override the default scoring weights in the task YAML. The weights are currently hardcoded (40/20/15/15/10); users should be able to tune them.
48
+
49
+ Required files:
50
+ - Modify `coderace/scoring.py` (or wherever scoring lives)
51
+ - `tests/test_custom_scoring.py`
52
+
53
+ - [ ] Add optional `scoring` section to task YAML schema:
54
+ ```yaml
55
+ scoring:
56
+   tests: 50
57
+   exit: 20
58
+   lint: 10
59
+   time: 10
60
+   lines: 10
61
+ ```
62
+ - [ ] Weights are automatically normalized to sum to 100
63
+ - [ ] If `scoring` section omitted, use current defaults
64
+ - [ ] Validate: all weights >= 0, no unknown keys
65
+ - [ ] Tests: custom weights, partial override, invalid weights, normalization
66
+
67
+ ### D3. HTML Report Output
68
+
69
+ Generate a self-contained HTML report from race results. This makes results shareable on blogs, tweets, and team Slack.
70
+
71
+ Required files:
72
+ - `coderace/report.py`
73
+ - `tests/test_report.py`
74
+
75
+ - [ ] `coderace results task.yaml --html report.html` generates a single-file HTML report
76
+ - [ ] Report includes: task name, date, agent scores table, scoring weights used, timing breakdown
77
+ - [ ] Styled with inline CSS (no external dependencies, single file)
78
+ - [ ] Table is sortable by clicking column headers (vanilla JS, inline)
79
+ - [ ] Include a "Generated by coderace" footer with version
80
+ - [ ] Tests: HTML generation, content validation, file output
81
+
82
+ ### D4. Statistical Mode (multiple runs)
83
+
84
+ Run the same task N times and report mean/stddev for each metric. Real benchmarking needs statistical significance.
85
+
86
+ Required files:
87
+ - Modify `coderace/runner.py` (or equivalent)
88
+ - `coderace/stats.py`
89
+ - `tests/test_stats.py`
90
+
91
+ - [ ] `coderace run task.yaml --runs 5` runs each agent 5 times
92
+ - [ ] Results show mean ± stddev for score, time, and lines changed
93
+ - [ ] Rich table adapts to show statistical columns
94
+ - [ ] JSON output includes per-run data + aggregates
95
+ - [ ] HTML report (D3) also supports statistical view
96
+ - [ ] Tests: multi-run aggregation, edge cases (1 run = no stddev), JSON schema
97
+
98
+ ### D5. Example Benchmark Tasks
99
+
100
+ Ship example tasks that work out of the box on any Python project. Users shouldn't have to write YAML from scratch to try coderace.
101
+
102
+ Required files:
103
+ - `examples/add-type-hints.yaml`
104
+ - `examples/fix-edge-case.yaml`
105
+ - `examples/write-tests.yaml`
106
+ - Update README with examples section
107
+
108
+ - [ ] 3 example task YAMLs that target common patterns (type hints, edge cases, test coverage)
109
+ - [ ] Each example has a description explaining what it tests
110
+ - [ ] Examples reference a small bundled test fixture (or clearly document how to point at user's repo)
111
+ - [ ] README section: "Try it now" with copy-paste commands
112
+ - [ ] Tests: validate example YAML files parse correctly
113
+
114
+ ## 4. Test Requirements
115
+
116
+ - [ ] Unit tests for each deliverable (specified above)
117
+ - [ ] All 39 existing tests must still pass
118
+ - [ ] Integration test: run full pipeline with mock agents including OpenCode
119
+ - [ ] Edge cases: empty results, single agent, all agents fail, custom weights sum to 0
120
+
121
+ ## 5. Reports
122
+
123
+ - Write progress to `progress-log.md` after each deliverable
124
+ - Include: what was built, what tests pass, what's next, any blockers
125
+ - Write a final summary when all deliverables are done or work has stopped
126
+
127
+ ## 6. Stop Conditions
128
+
129
+ - All deliverables checked and all tests passing -> DONE
130
+ - 3 consecutive failed attempts on same issue -> STOP, write blocker report
131
+ - Scope creep detected (new requirements discovered) -> STOP, report what's new
132
+ - All tests passing but deliverables remain -> continue to next deliverable
133
+
134
+ ## 7. Version Bump
135
+
136
+ - [ ] Bump version to 0.2.0 in pyproject.toml
137
+ - [ ] Update CHANGELOG or add one if missing
@@ -1,3 +1,3 @@
1
1
  """coderace - Race coding agents against each other on real tasks."""
2
2
 
3
- __version__ = "0.1.0"
3
+ __version__ = "0.2.0"
@@ -5,12 +5,14 @@ from coderace.adapters.base import BaseAdapter
5
5
  from coderace.adapters.claude import ClaudeAdapter
6
6
  from coderace.adapters.codex import CodexAdapter
7
7
  from coderace.adapters.gemini import GeminiAdapter
8
+ from coderace.adapters.opencode import OpenCodeAdapter
8
9
 
9
10
  ADAPTERS: dict[str, type[BaseAdapter]] = {
10
11
  "claude": ClaudeAdapter,
11
12
  "codex": CodexAdapter,
12
13
  "aider": AiderAdapter,
13
14
  "gemini": GeminiAdapter,
15
+ "opencode": OpenCodeAdapter,
14
16
  }
15
17
 
16
18
  __all__ = [
@@ -20,4 +22,5 @@ __all__ = [
20
22
  "CodexAdapter",
21
23
  "AiderAdapter",
22
24
  "GeminiAdapter",
25
+ "OpenCodeAdapter",
23
26
  ]
@@ -0,0 +1,18 @@
1
+ """OpenCode adapter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from coderace.adapters.base import BaseAdapter
6
+
7
+
8
+ class OpenCodeAdapter(BaseAdapter):
9
+ """Adapter for OpenCode CLI (terminal-first AI coding agent)."""
10
+
11
+ name = "opencode"
12
+
13
+ def build_command(self, task_description: str) -> list[str]:
14
+ return [
15
+ "opencode",
16
+ "run",
17
+ task_description,
18
+ ]