coderace 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {coderace-0.3.0 → coderace-0.4.0}/CHANGELOG.md +13 -0
- {coderace-0.3.0 → coderace-0.4.0}/PKG-INFO +72 -1
- {coderace-0.3.0 → coderace-0.4.0}/README.md +71 -0
- coderace-0.4.0/all-day-build-contract-cost-tracking.md +97 -0
- coderace-0.4.0/coderace/adapters/aider.py +33 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/adapters/base.py +32 -1
- {coderace-0.3.0 → coderace-0.4.0}/coderace/adapters/claude.py +13 -0
- coderace-0.4.0/coderace/adapters/codex.py +33 -0
- coderace-0.4.0/coderace/adapters/gemini.py +32 -0
- coderace-0.4.0/coderace/adapters/opencode.py +31 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/cli.py +23 -2
- {coderace-0.3.0 → coderace-0.4.0}/coderace/commands/results.py +17 -6
- coderace-0.4.0/coderace/cost.py +456 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/html_report.py +15 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/reporter.py +26 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/scorer.py +1 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/stats.py +9 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/task.py +33 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/types.py +7 -0
- {coderace-0.3.0 → coderace-0.4.0}/examples/add-type-hints.yaml +5 -0
- {coderace-0.3.0 → coderace-0.4.0}/examples/example-task.yaml +8 -0
- {coderace-0.3.0 → coderace-0.4.0}/examples/fix-edge-case.yaml +5 -0
- {coderace-0.3.0 → coderace-0.4.0}/examples/write-tests.yaml +5 -0
- coderace-0.4.0/progress-log.md +92 -0
- {coderace-0.3.0 → coderace-0.4.0}/pyproject.toml +1 -1
- coderace-0.4.0/tests/test_cost.py +432 -0
- coderace-0.4.0/tests/test_cost_config.py +311 -0
- coderace-0.4.0/tests/test_cost_integration.py +374 -0
- {coderace-0.3.0 → coderace-0.4.0}/uv.lock +1 -1
- coderace-0.3.0/coderace/adapters/aider.py +0 -20
- coderace-0.3.0/coderace/adapters/codex.py +0 -20
- coderace-0.3.0/coderace/adapters/gemini.py +0 -19
- coderace-0.3.0/coderace/adapters/opencode.py +0 -18
- coderace-0.3.0/progress-log.md +0 -93
- {coderace-0.3.0 → coderace-0.4.0}/.github/workflows/publish.yml +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/.gitignore +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/LICENSE +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/action.yml +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/all-day-build-contract-ci-integration.md +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/all-day-build-contract-v0.2.md +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/__init__.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/adapters/__init__.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/commands/__init__.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/commands/diff.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/coderace/git_ops.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/examples/ci-race-on-pr.yml +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/scripts/ci-run.sh +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/scripts/format-comment.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/__init__.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/conftest.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_adapters.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_cli.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_diff.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_examples.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_format_comment.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_git_ops.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_html_report.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_markdown_results.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_reporter.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_scorer.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_stats.py +0 -0
- {coderace-0.3.0 → coderace-0.4.0}/tests/test_task.py +0 -0
|
@@ -1,5 +1,18 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.4.0] - 2026-02-24
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- **Cost tracking** — Each agent run now includes an estimated API cost. The results table shows a `Cost (USD)` column in terminal, markdown, JSON, and HTML output.
|
|
8
|
+
- **`coderace/cost.py`** — Pricing engine: pricing table for Claude Code (Sonnet 4.6, Opus 4.6), Codex (GPT-5.3), Gemini CLI (2.5 Pro, 3.1 Pro), Aider, and OpenCode. `CostResult` dataclass with `input_tokens`, `output_tokens`, `estimated_cost_usd`, `model_name`, `pricing_source`.
|
|
9
|
+
- **Per-adapter `parse_cost()` methods** — Each adapter extracts token counts or cost info from the agent's stdout/stderr. Falls back to file-size estimation when tokens are unavailable.
|
|
10
|
+
- **`pricing:` section in task YAML** — Override pricing per-agent or per-model with `input_per_1m` / `output_per_1m` (USD per 1M tokens).
|
|
11
|
+
- **`--no-cost` flag** — `coderace run task.yaml --no-cost` disables cost tracking entirely.
|
|
12
|
+
- **HTML report $/score column** — The HTML report now shows cost and cost-per-point for direct efficiency comparison.
|
|
13
|
+
- **Statistical mode cost aggregation** — `--runs N` shows mean ± stddev for cost alongside score and time.
|
|
14
|
+
- **`coderace init` template** — Now includes a commented `pricing:` example section.
|
|
15
|
+
|
|
3
16
|
## [0.3.0] - 2026-02-24
|
|
4
17
|
|
|
5
18
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: coderace
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Race coding agents against each other on real tasks
|
|
5
5
|
Project-URL: Homepage, https://github.com/mikiships/coderace
|
|
6
6
|
Project-URL: Repository, https://github.com/mikiships/coderace
|
|
@@ -200,6 +200,77 @@ scoring:
|
|
|
200
200
|
|
|
201
201
|
Weights are normalized automatically (don't need to sum to 100).
|
|
202
202
|
|
|
203
|
+
## Cost Tracking
|
|
204
|
+
|
|
205
|
+
coderace automatically estimates API cost for each agent run. After every race, the results table includes a **Cost (USD)** column so you can compare quality-per-dollar, not just quality alone.
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
┌──────┬────────┬───────┬───────┬──────┬──────┬──────────┬───────┬────────────┐
|
|
209
|
+
│ Rank │ Agent │ Score │ Tests │ Exit │ Lint │ Time (s) │ Lines │ Cost (USD) │
|
|
210
|
+
├──────┼────────┼───────┼───────┼──────┼──────┼──────────┼───────┼────────────┤
|
|
211
|
+
│ 1 │ claude │ 85.0 │ PASS │ PASS │ PASS │ 10.5 │ 42 │ $0.0063 │
|
|
212
|
+
│ 2 │ codex │ 70.0 │ PASS │ PASS │ FAIL │ 15.2 │ 98 │ $0.0041 │
|
|
213
|
+
│ 3 │ aider │ 55.0 │ FAIL │ PASS │ PASS │ 8.1 │ 31 │ - │
|
|
214
|
+
└──────┴────────┴───────┴───────┴──────┴──────┴──────────┴───────┴────────────┘
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Cost appears in all output formats:
|
|
218
|
+
- **Terminal** — `Cost (USD)` column (shows `-` when unavailable)
|
|
219
|
+
- **Markdown** — `--format markdown` includes the column
|
|
220
|
+
- **JSON** — `cost` object per agent result with `input_tokens`, `output_tokens`, `estimated_cost_usd`, `model_name`, `pricing_source`
|
|
221
|
+
- **HTML report** — Cost column plus `$/score` ratio column for direct efficiency comparison
|
|
222
|
+
|
|
223
|
+
### How it works
|
|
224
|
+
|
|
225
|
+
Each agent adapter parses token counts or cost lines from the agent's CLI output:
|
|
226
|
+
|
|
227
|
+
| Agent | Source |
|
|
228
|
+
|-------|--------|
|
|
229
|
+
| Claude Code | `usage.input_tokens` / `usage.output_tokens` from JSON output; or "Total cost: $N" lines |
|
|
230
|
+
| Codex | `prompt_tokens=N, completion_tokens=N` usage summary |
|
|
231
|
+
| Gemini CLI | `inputTokenCount=N, outputTokenCount=N` lines |
|
|
232
|
+
| Aider | "Tokens: N sent, N received. Cost: $N message" lines |
|
|
233
|
+
| OpenCode | "Total cost: $N" or generic token lines |
|
|
234
|
+
|
|
235
|
+
If token counts are unavailable, cost is estimated from input file size + output diff size (marked as `pricing_source: "estimated"`).
|
|
236
|
+
|
|
237
|
+
### Disable cost tracking
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
coderace run task.yaml --no-cost
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
## Custom Pricing
|
|
244
|
+
|
|
245
|
+
Override the default pricing table in your task YAML — useful for custom models, negotiated rates, or open-source deployments.
|
|
246
|
+
|
|
247
|
+
```yaml
|
|
248
|
+
# pricing: per-agent or per-model overrides (USD per 1M tokens)
|
|
249
|
+
pricing:
|
|
250
|
+
claude:
|
|
251
|
+
input_per_1m: 3.00 # default for claude-sonnet-4-6
|
|
252
|
+
output_per_1m: 15.00
|
|
253
|
+
codex:
|
|
254
|
+
input_per_1m: 3.00
|
|
255
|
+
output_per_1m: 15.00
|
|
256
|
+
# Or use the model name directly:
|
|
257
|
+
claude-opus-4-6:
|
|
258
|
+
input_per_1m: 15.00
|
|
259
|
+
output_per_1m: 75.00
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
Keys can be agent names (`claude`, `codex`, `aider`, `gemini`, `opencode`) or model names (`claude-sonnet-4-6`, `gpt-5.3-codex`, `gemini-2.5-pro`). The default pricing table covers:
|
|
263
|
+
|
|
264
|
+
| Model | Input ($/1M) | Output ($/1M) |
|
|
265
|
+
|-------|-------------|--------------|
|
|
266
|
+
| claude-sonnet-4-6 | $3.00 | $15.00 |
|
|
267
|
+
| claude-opus-4-6 | $15.00 | $75.00 |
|
|
268
|
+
| gpt-5.3-codex | $3.00 | $15.00 |
|
|
269
|
+
| gemini-2.5-pro | $1.25 | $10.00 |
|
|
270
|
+
| gemini-3.1-pro | $1.25 | $10.00 |
|
|
271
|
+
|
|
272
|
+
Pricing is easy to update: the table lives in `coderace/cost.py` as a plain dict.
|
|
273
|
+
|
|
203
274
|
## Supported Agents
|
|
204
275
|
|
|
205
276
|
| Agent | CLI | Notes |
|
|
@@ -170,6 +170,77 @@ scoring:
|
|
|
170
170
|
|
|
171
171
|
Weights are normalized automatically (don't need to sum to 100).
|
|
172
172
|
|
|
173
|
+
## Cost Tracking
|
|
174
|
+
|
|
175
|
+
coderace automatically estimates API cost for each agent run. After every race, the results table includes a **Cost (USD)** column so you can compare quality-per-dollar, not just quality alone.
|
|
176
|
+
|
|
177
|
+
```
|
|
178
|
+
┌──────┬────────┬───────┬───────┬──────┬──────┬──────────┬───────┬────────────┐
|
|
179
|
+
│ Rank │ Agent │ Score │ Tests │ Exit │ Lint │ Time (s) │ Lines │ Cost (USD) │
|
|
180
|
+
├──────┼────────┼───────┼───────┼──────┼──────┼──────────┼───────┼────────────┤
|
|
181
|
+
│ 1 │ claude │ 85.0 │ PASS │ PASS │ PASS │ 10.5 │ 42 │ $0.0063 │
|
|
182
|
+
│ 2 │ codex │ 70.0 │ PASS │ PASS │ FAIL │ 15.2 │ 98 │ $0.0041 │
|
|
183
|
+
│ 3 │ aider │ 55.0 │ FAIL │ PASS │ PASS │ 8.1 │ 31 │ - │
|
|
184
|
+
└──────┴────────┴───────┴───────┴──────┴──────┴──────────┴───────┴────────────┘
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Cost appears in all output formats:
|
|
188
|
+
- **Terminal** — `Cost (USD)` column (shows `-` when unavailable)
|
|
189
|
+
- **Markdown** — `--format markdown` includes the column
|
|
190
|
+
- **JSON** — `cost` object per agent result with `input_tokens`, `output_tokens`, `estimated_cost_usd`, `model_name`, `pricing_source`
|
|
191
|
+
- **HTML report** — Cost column plus `$/score` ratio column for direct efficiency comparison
|
|
192
|
+
|
|
193
|
+
### How it works
|
|
194
|
+
|
|
195
|
+
Each agent adapter parses token counts or cost lines from the agent's CLI output:
|
|
196
|
+
|
|
197
|
+
| Agent | Source |
|
|
198
|
+
|-------|--------|
|
|
199
|
+
| Claude Code | `usage.input_tokens` / `usage.output_tokens` from JSON output; or "Total cost: $N" lines |
|
|
200
|
+
| Codex | `prompt_tokens=N, completion_tokens=N` usage summary |
|
|
201
|
+
| Gemini CLI | `inputTokenCount=N, outputTokenCount=N` lines |
|
|
202
|
+
| Aider | "Tokens: N sent, N received. Cost: $N message" lines |
|
|
203
|
+
| OpenCode | "Total cost: $N" or generic token lines |
|
|
204
|
+
|
|
205
|
+
If token counts are unavailable, cost is estimated from input file size + output diff size (marked as `pricing_source: "estimated"`).
|
|
206
|
+
|
|
207
|
+
### Disable cost tracking
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
coderace run task.yaml --no-cost
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Custom Pricing
|
|
214
|
+
|
|
215
|
+
Override the default pricing table in your task YAML — useful for custom models, negotiated rates, or open-source deployments.
|
|
216
|
+
|
|
217
|
+
```yaml
|
|
218
|
+
# pricing: per-agent or per-model overrides (USD per 1M tokens)
|
|
219
|
+
pricing:
|
|
220
|
+
claude:
|
|
221
|
+
input_per_1m: 3.00 # default for claude-sonnet-4-6
|
|
222
|
+
output_per_1m: 15.00
|
|
223
|
+
codex:
|
|
224
|
+
input_per_1m: 3.00
|
|
225
|
+
output_per_1m: 15.00
|
|
226
|
+
# Or use the model name directly:
|
|
227
|
+
claude-opus-4-6:
|
|
228
|
+
input_per_1m: 15.00
|
|
229
|
+
output_per_1m: 75.00
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Keys can be agent names (`claude`, `codex`, `aider`, `gemini`, `opencode`) or model names (`claude-sonnet-4-6`, `gpt-5.3-codex`, `gemini-2.5-pro`). The default pricing table covers:
|
|
233
|
+
|
|
234
|
+
| Model | Input ($/1M) | Output ($/1M) |
|
|
235
|
+
|-------|-------------|--------------|
|
|
236
|
+
| claude-sonnet-4-6 | $3.00 | $15.00 |
|
|
237
|
+
| claude-opus-4-6 | $15.00 | $75.00 |
|
|
238
|
+
| gpt-5.3-codex | $3.00 | $15.00 |
|
|
239
|
+
| gemini-2.5-pro | $1.25 | $10.00 |
|
|
240
|
+
| gemini-3.1-pro | $1.25 | $10.00 |
|
|
241
|
+
|
|
242
|
+
Pricing is easy to update: the table lives in `coderace/cost.py` as a plain dict.
|
|
243
|
+
|
|
173
244
|
## Supported Agents
|
|
174
245
|
|
|
175
246
|
| Agent | CLI | Notes |
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# All-Day Build Contract: Cost Tracking (v0.4.0)
|
|
2
|
+
|
|
3
|
+
Status: In Progress
|
|
4
|
+
Date: 2026-02-24
|
|
5
|
+
Owner: Sub-agent execution pass
|
|
6
|
+
Scope type: Deliverable-gated (no hour promises)
|
|
7
|
+
|
|
8
|
+
## 1. Objective
|
|
9
|
+
|
|
10
|
+
Add cost tracking to coderace so users can compare coding agents on quality-per-dollar, not just quality alone. When a race finishes, each agent's result includes estimated API cost. The results table shows a $/score column. This is the #1 missing comparison axis: everyone benchmarks speed and quality, nobody automates cost comparison.
|
|
11
|
+
|
|
12
|
+
This contract is considered complete only when every deliverable and validation gate below is satisfied.
|
|
13
|
+
|
|
14
|
+
## 2. Non-Negotiable Build Rules
|
|
15
|
+
|
|
16
|
+
1. No time-based completion claims.
|
|
17
|
+
2. Completion is allowed only when all checklist items are checked.
|
|
18
|
+
3. Full test suite must pass at the end.
|
|
19
|
+
4. New features must ship with docs and report addendum updates in the same pass.
|
|
20
|
+
5. CLI outputs must be deterministic and schema-backed where specified.
|
|
21
|
+
6. Never modify files outside the project directory.
|
|
22
|
+
7. Commit after each completed deliverable (not at the end).
|
|
23
|
+
8. If stuck on same issue for 3 attempts, stop and write a blocker report.
|
|
24
|
+
9. Do NOT refactor, restyle, or "improve" code outside the deliverables.
|
|
25
|
+
10. Read existing tests and docs before writing new code.
|
|
26
|
+
|
|
27
|
+
## 3. Feature Deliverables
|
|
28
|
+
|
|
29
|
+
### D1. Cost estimation engine (core)
|
|
30
|
+
|
|
31
|
+
Build a cost estimation module that maps agent CLI output to dollar costs. Each agent adapter gets a `parse_cost()` method that extracts token counts or cost info from the agent's stdout/stderr.
|
|
32
|
+
|
|
33
|
+
Required:
|
|
34
|
+
- `coderace/cost.py` — pricing tables, cost calculation logic
|
|
35
|
+
- `coderace/adapters/*.py` — updated with parse_cost() methods
|
|
36
|
+
|
|
37
|
+
- [ ] Pricing table for: Claude Code (Sonnet 4.6, Opus 4.6), Codex (GPT-5.3), Gemini CLI (Gemini 2.5 Pro, Gemini 3.1 Pro), Aider (configurable model), OpenCode (configurable model)
|
|
38
|
+
- [ ] Parse token counts from each agent's output (Claude Code prints session summary, Codex prints usage, etc.)
|
|
39
|
+
- [ ] Fallback: if token counts unavailable, estimate from input file size + output diff size using per-model pricing
|
|
40
|
+
- [ ] CostResult dataclass: input_tokens, output_tokens, estimated_cost_usd, model_name, pricing_source
|
|
41
|
+
- [ ] Tests for D1: unit tests for each parser, edge cases (missing output, unknown model)
|
|
42
|
+
|
|
43
|
+
### D2. Results integration
|
|
44
|
+
|
|
45
|
+
Integrate cost data into the race results pipeline. Show cost alongside score in all output formats.
|
|
46
|
+
|
|
47
|
+
Required:
|
|
48
|
+
- `coderace/results.py` — updated
|
|
49
|
+
- `coderace/cli.py` — updated
|
|
50
|
+
|
|
51
|
+
- [ ] Race results include cost_usd field per agent
|
|
52
|
+
- [ ] `coderace results` terminal output shows Cost column
|
|
53
|
+
- [ ] `--format markdown` includes cost column
|
|
54
|
+
- [ ] `--format json` includes cost object
|
|
55
|
+
- [ ] HTML report includes cost column with $/score ratio
|
|
56
|
+
- [ ] Statistical mode (`--runs N`) aggregates cost: mean ± stddev
|
|
57
|
+
- [ ] Tests for D2
|
|
58
|
+
|
|
59
|
+
### D3. Cost configuration
|
|
60
|
+
|
|
61
|
+
Allow users to override pricing in task YAML (for custom models, negotiated rates, etc).
|
|
62
|
+
|
|
63
|
+
Required:
|
|
64
|
+
- `coderace/config.py` or extend task YAML schema
|
|
65
|
+
|
|
66
|
+
- [ ] `pricing:` section in task YAML: per-agent or per-model overrides
|
|
67
|
+
- [ ] `coderace init` template includes commented pricing example
|
|
68
|
+
- [ ] `--no-cost` flag to disable cost tracking entirely
|
|
69
|
+
- [ ] Tests for D3
|
|
70
|
+
|
|
71
|
+
### D4. Documentation
|
|
72
|
+
|
|
73
|
+
- [ ] README section: "Cost Tracking" with example output
|
|
74
|
+
- [ ] README section: "Custom Pricing" showing YAML config
|
|
75
|
+
- [ ] CHANGELOG entry for v0.4.0
|
|
76
|
+
- [ ] Update example task YAMLs with pricing comments
|
|
77
|
+
|
|
78
|
+
## 4. Test Requirements
|
|
79
|
+
|
|
80
|
+
- [ ] Unit tests for cost parsing (each adapter)
|
|
81
|
+
- [ ] Unit tests for pricing calculation
|
|
82
|
+
- [ ] Integration test: full race with cost output
|
|
83
|
+
- [ ] Edge cases: agent crashes (no cost data), unknown model, zero tokens
|
|
84
|
+
- [ ] All existing 130 tests must still pass
|
|
85
|
+
|
|
86
|
+
## 5. Reports
|
|
87
|
+
|
|
88
|
+
- Write progress to `progress-log.md` after each deliverable
|
|
89
|
+
- Include: what was built, what tests pass, what's next, any blockers
|
|
90
|
+
- Final summary when all deliverables done or stopped
|
|
91
|
+
|
|
92
|
+
## 6. Stop Conditions
|
|
93
|
+
|
|
94
|
+
- All deliverables checked and all tests passing -> DONE
|
|
95
|
+
- 3 consecutive failed attempts on same issue -> STOP, write blocker report
|
|
96
|
+
- Scope creep detected (new requirements discovered) -> STOP, report what's new
|
|
97
|
+
- All tests passing but deliverables remain -> continue to next deliverable
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Aider adapter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from coderace.adapters.base import BaseAdapter
|
|
8
|
+
from coderace.cost import CostResult, parse_aider_cost
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AiderAdapter(BaseAdapter):
|
|
12
|
+
"""Adapter for Aider coding assistant."""
|
|
13
|
+
|
|
14
|
+
name = "aider"
|
|
15
|
+
|
|
16
|
+
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
+
return [
|
|
18
|
+
"aider",
|
|
19
|
+
"--message",
|
|
20
|
+
task_description,
|
|
21
|
+
"--yes",
|
|
22
|
+
"--no-auto-commits",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
def parse_cost(
|
|
26
|
+
self,
|
|
27
|
+
stdout: str,
|
|
28
|
+
stderr: str,
|
|
29
|
+
model_name: str = "aider-default",
|
|
30
|
+
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
31
|
+
) -> Optional[CostResult]:
|
|
32
|
+
"""Parse cost data from Aider output."""
|
|
33
|
+
return parse_aider_cost(stdout, stderr, model_name, custom_pricing)
|
|
@@ -6,7 +6,9 @@ import subprocess
|
|
|
6
6
|
import time
|
|
7
7
|
from abc import ABC, abstractmethod
|
|
8
8
|
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
9
10
|
|
|
11
|
+
from coderace.cost import CostResult
|
|
10
12
|
from coderace.types import AgentResult
|
|
11
13
|
|
|
12
14
|
|
|
@@ -20,7 +22,27 @@ class BaseAdapter(ABC):
|
|
|
20
22
|
"""Build the CLI command to invoke this agent."""
|
|
21
23
|
...
|
|
22
24
|
|
|
23
|
-
def run(self, task_description: str, workdir: Path, timeout: int) -> AgentResult:
|
|
25
|
+
def parse_cost(
|
|
26
|
+
self,
|
|
27
|
+
stdout: str,
|
|
28
|
+
stderr: str,
|
|
29
|
+
model_name: str = "",
|
|
30
|
+
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
31
|
+
) -> Optional[CostResult]:
|
|
32
|
+
"""Parse cost data from agent output. Override in subclasses.
|
|
33
|
+
|
|
34
|
+
Returns None if cost data is unavailable.
|
|
35
|
+
"""
|
|
36
|
+
return None
|
|
37
|
+
|
|
38
|
+
def run(
|
|
39
|
+
self,
|
|
40
|
+
task_description: str,
|
|
41
|
+
workdir: Path,
|
|
42
|
+
timeout: int,
|
|
43
|
+
no_cost: bool = False,
|
|
44
|
+
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
45
|
+
) -> AgentResult:
|
|
24
46
|
"""Run the agent on a task and capture results."""
|
|
25
47
|
cmd = self.build_command(task_description)
|
|
26
48
|
start = time.monotonic()
|
|
@@ -50,6 +72,14 @@ class BaseAdapter(ABC):
|
|
|
50
72
|
|
|
51
73
|
wall_time = time.monotonic() - start
|
|
52
74
|
|
|
75
|
+
# Parse cost (fails gracefully — never raises)
|
|
76
|
+
cost_result: Optional[CostResult] = None
|
|
77
|
+
if not no_cost:
|
|
78
|
+
try:
|
|
79
|
+
cost_result = self.parse_cost(stdout, stderr, custom_pricing=custom_pricing)
|
|
80
|
+
except Exception:
|
|
81
|
+
pass
|
|
82
|
+
|
|
53
83
|
return AgentResult(
|
|
54
84
|
agent=self.name,
|
|
55
85
|
exit_code=exit_code,
|
|
@@ -57,4 +87,5 @@ class BaseAdapter(ABC):
|
|
|
57
87
|
stderr=stderr,
|
|
58
88
|
wall_time=wall_time,
|
|
59
89
|
timed_out=timed_out,
|
|
90
|
+
cost_result=cost_result,
|
|
60
91
|
)
|
|
@@ -2,7 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
5
7
|
from coderace.adapters.base import BaseAdapter
|
|
8
|
+
from coderace.cost import CostResult, parse_claude_cost
|
|
6
9
|
|
|
7
10
|
|
|
8
11
|
class ClaudeAdapter(BaseAdapter):
|
|
@@ -19,3 +22,13 @@ class ClaudeAdapter(BaseAdapter):
|
|
|
19
22
|
"-p",
|
|
20
23
|
task_description,
|
|
21
24
|
]
|
|
25
|
+
|
|
26
|
+
def parse_cost(
|
|
27
|
+
self,
|
|
28
|
+
stdout: str,
|
|
29
|
+
stderr: str,
|
|
30
|
+
model_name: str = "claude-sonnet-4-6",
|
|
31
|
+
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
32
|
+
) -> Optional[CostResult]:
|
|
33
|
+
"""Parse cost data from Claude Code output."""
|
|
34
|
+
return parse_claude_cost(stdout, stderr, model_name, custom_pricing)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Codex CLI adapter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from coderace.adapters.base import BaseAdapter
|
|
8
|
+
from coderace.cost import CostResult, parse_codex_cost
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CodexAdapter(BaseAdapter):
|
|
12
|
+
"""Adapter for OpenAI Codex CLI."""
|
|
13
|
+
|
|
14
|
+
name = "codex"
|
|
15
|
+
|
|
16
|
+
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
+
return [
|
|
18
|
+
"codex",
|
|
19
|
+
"--quiet",
|
|
20
|
+
"--full-auto",
|
|
21
|
+
"-p",
|
|
22
|
+
task_description,
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
def parse_cost(
|
|
26
|
+
self,
|
|
27
|
+
stdout: str,
|
|
28
|
+
stderr: str,
|
|
29
|
+
model_name: str = "gpt-5.3-codex",
|
|
30
|
+
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
31
|
+
) -> Optional[CostResult]:
|
|
32
|
+
"""Parse cost data from Codex CLI output."""
|
|
33
|
+
return parse_codex_cost(stdout, stderr, model_name, custom_pricing)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Gemini CLI adapter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from coderace.adapters.base import BaseAdapter
|
|
8
|
+
from coderace.cost import CostResult, parse_gemini_cost
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class GeminiAdapter(BaseAdapter):
|
|
12
|
+
"""Adapter for Google Gemini CLI."""
|
|
13
|
+
|
|
14
|
+
name = "gemini"
|
|
15
|
+
|
|
16
|
+
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
+
return [
|
|
18
|
+
"gemini",
|
|
19
|
+
"--non-interactive",
|
|
20
|
+
"-p",
|
|
21
|
+
task_description,
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
def parse_cost(
|
|
25
|
+
self,
|
|
26
|
+
stdout: str,
|
|
27
|
+
stderr: str,
|
|
28
|
+
model_name: str = "gemini-2.5-pro",
|
|
29
|
+
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
30
|
+
) -> Optional[CostResult]:
|
|
31
|
+
"""Parse cost data from Gemini CLI output."""
|
|
32
|
+
return parse_gemini_cost(stdout, stderr, model_name, custom_pricing)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""OpenCode adapter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from coderace.adapters.base import BaseAdapter
|
|
8
|
+
from coderace.cost import CostResult, parse_opencode_cost
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class OpenCodeAdapter(BaseAdapter):
|
|
12
|
+
"""Adapter for OpenCode CLI (terminal-first AI coding agent)."""
|
|
13
|
+
|
|
14
|
+
name = "opencode"
|
|
15
|
+
|
|
16
|
+
def build_command(self, task_description: str) -> list[str]:
|
|
17
|
+
return [
|
|
18
|
+
"opencode",
|
|
19
|
+
"run",
|
|
20
|
+
task_description,
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
def parse_cost(
|
|
24
|
+
self,
|
|
25
|
+
stdout: str,
|
|
26
|
+
stderr: str,
|
|
27
|
+
model_name: str = "opencode-default",
|
|
28
|
+
custom_pricing: dict[str, tuple[float, float]] | None = None,
|
|
29
|
+
) -> Optional[CostResult]:
|
|
30
|
+
"""Parse cost data from OpenCode output."""
|
|
31
|
+
return parse_opencode_cost(stdout, stderr, model_name, custom_pricing)
|
|
@@ -51,6 +51,8 @@ def _run_agent_sequential(
|
|
|
51
51
|
branch: str,
|
|
52
52
|
base_ref: str,
|
|
53
53
|
timeout: int,
|
|
54
|
+
no_cost: bool = False,
|
|
55
|
+
custom_pricing: dict | None = None,
|
|
54
56
|
) -> tuple[AgentResult | None, int]:
|
|
55
57
|
"""Run a single agent sequentially (on the main repo). Returns (result, lines_changed)."""
|
|
56
58
|
try:
|
|
@@ -59,7 +61,7 @@ def _run_agent_sequential(
|
|
|
59
61
|
return None, 0
|
|
60
62
|
|
|
61
63
|
adapter = ADAPTERS[agent_name]()
|
|
62
|
-
result = adapter.run(task_description, repo, timeout)
|
|
64
|
+
result = adapter.run(task_description, repo, timeout, no_cost=no_cost, custom_pricing=custom_pricing)
|
|
63
65
|
|
|
64
66
|
_, lines = get_diff_stat(repo, base_ref)
|
|
65
67
|
return result, lines
|
|
@@ -72,6 +74,8 @@ def _run_agent_worktree(
|
|
|
72
74
|
branch: str,
|
|
73
75
|
base_ref: str,
|
|
74
76
|
timeout: int,
|
|
77
|
+
no_cost: bool = False,
|
|
78
|
+
custom_pricing: dict | None = None,
|
|
75
79
|
) -> tuple[AgentResult | None, int]:
|
|
76
80
|
"""Run a single agent in a git worktree (for parallel execution)."""
|
|
77
81
|
import tempfile
|
|
@@ -87,7 +91,7 @@ def _run_agent_worktree(
|
|
|
87
91
|
add_worktree(repo, worktree_dir, branch)
|
|
88
92
|
|
|
89
93
|
adapter = ADAPTERS[agent_name]()
|
|
90
|
-
result = adapter.run(task_description, worktree_dir, timeout)
|
|
94
|
+
result = adapter.run(task_description, worktree_dir, timeout, no_cost=no_cost, custom_pricing=custom_pricing)
|
|
91
95
|
|
|
92
96
|
_, lines = get_diff_stat(worktree_dir, base_ref)
|
|
93
97
|
return result, lines
|
|
@@ -114,6 +118,9 @@ def run(
|
|
|
114
118
|
runs: int = typer.Option(
|
|
115
119
|
1, "--runs", "-n", help="Number of runs (>1 for stats)"
|
|
116
120
|
),
|
|
121
|
+
no_cost: bool = typer.Option(
|
|
122
|
+
False, "--no-cost", help="Disable cost tracking"
|
|
123
|
+
),
|
|
117
124
|
) -> None:
|
|
118
125
|
"""Run all agents on a task and score the results."""
|
|
119
126
|
task = load_task(task_file)
|
|
@@ -195,6 +202,8 @@ def run(
|
|
|
195
202
|
branch,
|
|
196
203
|
base_ref,
|
|
197
204
|
task.timeout,
|
|
205
|
+
no_cost,
|
|
206
|
+
task.pricing,
|
|
198
207
|
)
|
|
199
208
|
futures[future] = agent_name
|
|
200
209
|
|
|
@@ -248,6 +257,8 @@ def run(
|
|
|
248
257
|
branch,
|
|
249
258
|
base_ref,
|
|
250
259
|
task.timeout,
|
|
260
|
+
no_cost=no_cost,
|
|
261
|
+
custom_pricing=task.pricing,
|
|
251
262
|
)
|
|
252
263
|
|
|
253
264
|
if result is None:
|
|
@@ -417,6 +428,8 @@ def _save_stats_json(
|
|
|
417
428
|
"exit_clean_rate": s.exit_clean_rate,
|
|
418
429
|
"lint_clean_rate": s.lint_clean_rate,
|
|
419
430
|
"per_run_scores": s.per_run_scores,
|
|
431
|
+
"cost_mean": s.cost_mean,
|
|
432
|
+
"cost_stddev": s.cost_stddev,
|
|
420
433
|
}
|
|
421
434
|
)
|
|
422
435
|
|
|
@@ -484,9 +497,16 @@ def results(
|
|
|
484
497
|
table.add_column("Lint", justify="center")
|
|
485
498
|
table.add_column("Time (s)", justify="right")
|
|
486
499
|
table.add_column("Lines", justify="right")
|
|
500
|
+
table.add_column("Cost (USD)", justify="right")
|
|
487
501
|
|
|
488
502
|
for entry in data:
|
|
489
503
|
b = entry["breakdown"]
|
|
504
|
+
cost_info = entry.get("cost")
|
|
505
|
+
cost_str = (
|
|
506
|
+
f"${cost_info['estimated_cost_usd']:.4f}"
|
|
507
|
+
if cost_info is not None
|
|
508
|
+
else "-"
|
|
509
|
+
)
|
|
490
510
|
table.add_row(
|
|
491
511
|
str(entry["rank"]),
|
|
492
512
|
entry["agent"],
|
|
@@ -496,6 +516,7 @@ def results(
|
|
|
496
516
|
_bool_icon(b["lint_clean"]),
|
|
497
517
|
f"{b['wall_time']:.1f}",
|
|
498
518
|
str(b["lines_changed"]),
|
|
519
|
+
cost_str,
|
|
499
520
|
)
|
|
500
521
|
|
|
501
522
|
console.print(table)
|
|
@@ -39,16 +39,21 @@ def format_markdown_results(scores: list[Score], task_name: str = "") -> str:
|
|
|
39
39
|
)
|
|
40
40
|
|
|
41
41
|
# Table header
|
|
42
|
-
header = "| Rank | Agent | Score | Tests | Lint | Exit | Time (s) | Lines |\n"
|
|
43
|
-
separator = "|------|-------|------:|:-----:|:----:|:----:|---------:|------:|\n"
|
|
42
|
+
header = "| Rank | Agent | Score | Tests | Lint | Exit | Time (s) | Lines | Cost (USD) |\n"
|
|
43
|
+
separator = "|------|-------|------:|:-----:|:----:|:----:|---------:|------:|-----------:|\n"
|
|
44
44
|
|
|
45
45
|
rows: list[str] = []
|
|
46
46
|
for i, score in enumerate(ranked, 1):
|
|
47
47
|
b = score.breakdown
|
|
48
|
+
cost_str = (
|
|
49
|
+
f"${score.cost_result.estimated_cost_usd:.4f}"
|
|
50
|
+
if score.cost_result is not None
|
|
51
|
+
else "-"
|
|
52
|
+
)
|
|
48
53
|
row = (
|
|
49
54
|
f"| {i} | `{score.agent}` | {score.composite:.1f} |"
|
|
50
55
|
f" {_bool_md(b.tests_pass)} | {_bool_md(b.lint_clean)} |"
|
|
51
|
-
f" {_bool_md(b.exit_clean)} | {b.wall_time:.1f} | {b.lines_changed} |"
|
|
56
|
+
f" {_bool_md(b.exit_clean)} | {b.wall_time:.1f} | {b.lines_changed} | {cost_str} |"
|
|
52
57
|
)
|
|
53
58
|
rows.append(row)
|
|
54
59
|
|
|
@@ -84,12 +89,18 @@ def format_markdown_from_json(data: list[dict], task_name: str = "") -> str:
|
|
|
84
89
|
heading = f"## coderace results: {task_name}\n\n" if task_name else "## coderace results\n\n"
|
|
85
90
|
summary = f"**Winner:** `{agent}` — {score:.1f} pts | {n} agent(s) raced\n\n"
|
|
86
91
|
|
|
87
|
-
header = "| Rank | Agent | Score | Tests | Lint | Exit | Time (s) | Lines |\n"
|
|
88
|
-
separator = "|------|-------|------:|:-----:|:----:|:----:|---------:|------:|\n"
|
|
92
|
+
header = "| Rank | Agent | Score | Tests | Lint | Exit | Time (s) | Lines | Cost (USD) |\n"
|
|
93
|
+
separator = "|------|-------|------:|:-----:|:----:|:----:|---------:|------:|-----------:|\n"
|
|
89
94
|
|
|
90
95
|
rows: list[str] = []
|
|
91
96
|
for entry in data:
|
|
92
97
|
b = entry.get("breakdown", {})
|
|
98
|
+
cost_info = entry.get("cost")
|
|
99
|
+
cost_str = (
|
|
100
|
+
f"${cost_info['estimated_cost_usd']:.4f}"
|
|
101
|
+
if cost_info is not None
|
|
102
|
+
else "-"
|
|
103
|
+
)
|
|
93
104
|
rank = entry.get("rank", "?")
|
|
94
105
|
a = entry.get("agent", "?")
|
|
95
106
|
sc = entry.get("composite_score", 0.0)
|
|
@@ -98,7 +109,7 @@ def format_markdown_from_json(data: list[dict], task_name: str = "") -> str:
|
|
|
98
109
|
f" {_bool_md(b.get('tests_pass', False))} |"
|
|
99
110
|
f" {_bool_md(b.get('lint_clean', False))} |"
|
|
100
111
|
f" {_bool_md(b.get('exit_clean', False))} |"
|
|
101
|
-
f" {b.get('wall_time', 0.0):.1f} | {b.get('lines_changed', 0)} |"
|
|
112
|
+
f" {b.get('wall_time', 0.0):.1f} | {b.get('lines_changed', 0)} | {cost_str} |"
|
|
102
113
|
)
|
|
103
114
|
rows.append(row)
|
|
104
115
|
|