coderace 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. {coderace-0.3.0 → coderace-0.4.0}/CHANGELOG.md +13 -0
  2. {coderace-0.3.0 → coderace-0.4.0}/PKG-INFO +72 -1
  3. {coderace-0.3.0 → coderace-0.4.0}/README.md +71 -0
  4. coderace-0.4.0/all-day-build-contract-cost-tracking.md +97 -0
  5. coderace-0.4.0/coderace/adapters/aider.py +33 -0
  6. {coderace-0.3.0 → coderace-0.4.0}/coderace/adapters/base.py +32 -1
  7. {coderace-0.3.0 → coderace-0.4.0}/coderace/adapters/claude.py +13 -0
  8. coderace-0.4.0/coderace/adapters/codex.py +33 -0
  9. coderace-0.4.0/coderace/adapters/gemini.py +32 -0
  10. coderace-0.4.0/coderace/adapters/opencode.py +31 -0
  11. {coderace-0.3.0 → coderace-0.4.0}/coderace/cli.py +23 -2
  12. {coderace-0.3.0 → coderace-0.4.0}/coderace/commands/results.py +17 -6
  13. coderace-0.4.0/coderace/cost.py +456 -0
  14. {coderace-0.3.0 → coderace-0.4.0}/coderace/html_report.py +15 -0
  15. {coderace-0.3.0 → coderace-0.4.0}/coderace/reporter.py +26 -0
  16. {coderace-0.3.0 → coderace-0.4.0}/coderace/scorer.py +1 -0
  17. {coderace-0.3.0 → coderace-0.4.0}/coderace/stats.py +9 -0
  18. {coderace-0.3.0 → coderace-0.4.0}/coderace/task.py +33 -0
  19. {coderace-0.3.0 → coderace-0.4.0}/coderace/types.py +7 -0
  20. {coderace-0.3.0 → coderace-0.4.0}/examples/add-type-hints.yaml +5 -0
  21. {coderace-0.3.0 → coderace-0.4.0}/examples/example-task.yaml +8 -0
  22. {coderace-0.3.0 → coderace-0.4.0}/examples/fix-edge-case.yaml +5 -0
  23. {coderace-0.3.0 → coderace-0.4.0}/examples/write-tests.yaml +5 -0
  24. coderace-0.4.0/progress-log.md +92 -0
  25. {coderace-0.3.0 → coderace-0.4.0}/pyproject.toml +1 -1
  26. coderace-0.4.0/tests/test_cost.py +432 -0
  27. coderace-0.4.0/tests/test_cost_config.py +311 -0
  28. coderace-0.4.0/tests/test_cost_integration.py +374 -0
  29. {coderace-0.3.0 → coderace-0.4.0}/uv.lock +1 -1
  30. coderace-0.3.0/coderace/adapters/aider.py +0 -20
  31. coderace-0.3.0/coderace/adapters/codex.py +0 -20
  32. coderace-0.3.0/coderace/adapters/gemini.py +0 -19
  33. coderace-0.3.0/coderace/adapters/opencode.py +0 -18
  34. coderace-0.3.0/progress-log.md +0 -93
  35. {coderace-0.3.0 → coderace-0.4.0}/.github/workflows/publish.yml +0 -0
  36. {coderace-0.3.0 → coderace-0.4.0}/.gitignore +0 -0
  37. {coderace-0.3.0 → coderace-0.4.0}/LICENSE +0 -0
  38. {coderace-0.3.0 → coderace-0.4.0}/action.yml +0 -0
  39. {coderace-0.3.0 → coderace-0.4.0}/all-day-build-contract-ci-integration.md +0 -0
  40. {coderace-0.3.0 → coderace-0.4.0}/all-day-build-contract-v0.2.md +0 -0
  41. {coderace-0.3.0 → coderace-0.4.0}/coderace/__init__.py +0 -0
  42. {coderace-0.3.0 → coderace-0.4.0}/coderace/adapters/__init__.py +0 -0
  43. {coderace-0.3.0 → coderace-0.4.0}/coderace/commands/__init__.py +0 -0
  44. {coderace-0.3.0 → coderace-0.4.0}/coderace/commands/diff.py +0 -0
  45. {coderace-0.3.0 → coderace-0.4.0}/coderace/git_ops.py +0 -0
  46. {coderace-0.3.0 → coderace-0.4.0}/examples/ci-race-on-pr.yml +0 -0
  47. {coderace-0.3.0 → coderace-0.4.0}/scripts/ci-run.sh +0 -0
  48. {coderace-0.3.0 → coderace-0.4.0}/scripts/format-comment.py +0 -0
  49. {coderace-0.3.0 → coderace-0.4.0}/tests/__init__.py +0 -0
  50. {coderace-0.3.0 → coderace-0.4.0}/tests/conftest.py +0 -0
  51. {coderace-0.3.0 → coderace-0.4.0}/tests/test_adapters.py +0 -0
  52. {coderace-0.3.0 → coderace-0.4.0}/tests/test_cli.py +0 -0
  53. {coderace-0.3.0 → coderace-0.4.0}/tests/test_diff.py +0 -0
  54. {coderace-0.3.0 → coderace-0.4.0}/tests/test_examples.py +0 -0
  55. {coderace-0.3.0 → coderace-0.4.0}/tests/test_format_comment.py +0 -0
  56. {coderace-0.3.0 → coderace-0.4.0}/tests/test_git_ops.py +0 -0
  57. {coderace-0.3.0 → coderace-0.4.0}/tests/test_html_report.py +0 -0
  58. {coderace-0.3.0 → coderace-0.4.0}/tests/test_markdown_results.py +0 -0
  59. {coderace-0.3.0 → coderace-0.4.0}/tests/test_reporter.py +0 -0
  60. {coderace-0.3.0 → coderace-0.4.0}/tests/test_scorer.py +0 -0
  61. {coderace-0.3.0 → coderace-0.4.0}/tests/test_stats.py +0 -0
  62. {coderace-0.3.0 → coderace-0.4.0}/tests/test_task.py +0 -0
@@ -1,5 +1,18 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.4.0] - 2026-02-24
4
+
5
+ ### Added
6
+
7
+ - **Cost tracking** — Each agent run now includes an estimated API cost. Terminal, Markdown, and HTML results show a `Cost (USD)` column; JSON output carries a `cost` object per agent result.
8
+ - **`coderace/cost.py`** — Pricing engine: pricing table for Claude Code (Sonnet 4.6, Opus 4.6), Codex (GPT-5.3), Gemini CLI (2.5 Pro, 3.1 Pro), Aider, and OpenCode. `CostResult` dataclass with `input_tokens`, `output_tokens`, `estimated_cost_usd`, `model_name`, `pricing_source`.
9
+ - **Per-adapter `parse_cost()` methods** — Each adapter extracts token counts or cost info from the agent's stdout/stderr. Falls back to file-size estimation when tokens are unavailable.
10
+ - **`pricing:` section in task YAML** — Override pricing per-agent or per-model with `input_per_1m` / `output_per_1m` (USD per 1M tokens).
11
+ - **`--no-cost` flag** — `coderace run task.yaml --no-cost` disables cost tracking entirely.
12
+ - **HTML report $/score column** — The HTML report now shows cost and cost-per-point for direct efficiency comparison.
13
+ - **Statistical mode cost aggregation** — `--runs N` shows mean ± stddev for cost alongside score and time.
14
+ - **`coderace init` template** — Now includes a commented `pricing:` example section.
15
+
3
16
  ## [0.3.0] - 2026-02-24
4
17
 
5
18
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderace
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Race coding agents against each other on real tasks
5
5
  Project-URL: Homepage, https://github.com/mikiships/coderace
6
6
  Project-URL: Repository, https://github.com/mikiships/coderace
@@ -200,6 +200,77 @@ scoring:
200
200
 
201
201
  Weights are normalized automatically (don't need to sum to 100).
202
202
 
203
+ ## Cost Tracking
204
+
205
+ coderace automatically estimates API cost for each agent run. After every race, the results table includes a **Cost (USD)** column so you can compare quality-per-dollar, not just quality alone.
206
+
207
+ ```
208
+ ┌──────┬────────┬───────┬───────┬──────┬──────┬──────────┬───────┬────────────┐
209
+ │ Rank │ Agent │ Score │ Tests │ Exit │ Lint │ Time (s) │ Lines │ Cost (USD) │
210
+ ├──────┼────────┼───────┼───────┼──────┼──────┼──────────┼───────┼────────────┤
211
+ │ 1 │ claude │ 85.0 │ PASS │ PASS │ PASS │ 10.5 │ 42 │ $0.0063 │
212
+ │ 2 │ codex │ 70.0 │ PASS │ PASS │ FAIL │ 15.2 │ 98 │ $0.0041 │
213
+ │ 3 │ aider │ 55.0 │ FAIL │ PASS │ PASS │ 8.1 │ 31 │ - │
214
+ └──────┴────────┴───────┴───────┴──────┴──────┴──────────┴───────┴────────────┘
215
+ ```
216
+
217
+ Cost appears in all output formats:
218
+ - **Terminal** — `Cost (USD)` column (shows `-` when unavailable)
219
+ - **Markdown** — `--format markdown` includes the column
220
+ - **JSON** — `cost` object per agent result with `input_tokens`, `output_tokens`, `estimated_cost_usd`, `model_name`, `pricing_source`
221
+ - **HTML report** — Cost column plus `$/score` ratio column for direct efficiency comparison
222
+
223
+ ### How it works
224
+
225
+ Each agent adapter parses token counts or cost lines from the agent's CLI output:
226
+
227
+ | Agent | Source |
228
+ |-------|--------|
229
+ | Claude Code | `usage.input_tokens` / `usage.output_tokens` from JSON output; or "Total cost: $N" lines |
230
+ | Codex | `prompt_tokens=N, completion_tokens=N` usage summary |
231
+ | Gemini CLI | `inputTokenCount=N, outputTokenCount=N` lines |
232
+ | Aider | "Tokens: N sent, N received. Cost: $N message" lines |
233
+ | OpenCode | "Total cost: $N" or generic token lines |
234
+
235
+ If token counts are unavailable, cost is estimated from input file size + output diff size (marked as `pricing_source: "estimated"`).
236
+
237
+ ### Disable cost tracking
238
+
239
+ ```bash
240
+ coderace run task.yaml --no-cost
241
+ ```
242
+
243
+ ## Custom Pricing
244
+
245
+ Override the default pricing table in your task YAML — useful for custom models, negotiated rates, or open-source deployments.
246
+
247
+ ```yaml
248
+ # pricing: per-agent or per-model overrides (USD per 1M tokens)
249
+ pricing:
250
+ claude:
251
+ input_per_1m: 3.00 # default for claude-sonnet-4-6
252
+ output_per_1m: 15.00
253
+ codex:
254
+ input_per_1m: 3.00
255
+ output_per_1m: 15.00
256
+ # Or use the model name directly:
257
+ claude-opus-4-6:
258
+ input_per_1m: 15.00
259
+ output_per_1m: 75.00
260
+ ```
261
+
262
+ Keys can be agent names (`claude`, `codex`, `aider`, `gemini`, `opencode`) or model names (`claude-sonnet-4-6`, `gpt-5.3-codex`, `gemini-2.5-pro`). The default pricing table covers:
263
+
264
+ | Model | Input ($/1M) | Output ($/1M) |
265
+ |-------|-------------|--------------|
266
+ | claude-sonnet-4-6 | $3.00 | $15.00 |
267
+ | claude-opus-4-6 | $15.00 | $75.00 |
268
+ | gpt-5.3-codex | $3.00 | $15.00 |
269
+ | gemini-2.5-pro | $1.25 | $10.00 |
270
+ | gemini-3.1-pro | $1.25 | $10.00 |
271
+
272
+ Pricing is easy to update: the table lives in `coderace/cost.py` as a plain dict.
273
+
203
274
  ## Supported Agents
204
275
 
205
276
  | Agent | CLI | Notes |
@@ -170,6 +170,77 @@ scoring:
170
170
 
171
171
  Weights are normalized automatically (don't need to sum to 100).
172
172
 
173
+ ## Cost Tracking
174
+
175
+ coderace automatically estimates API cost for each agent run. After every race, the results table includes a **Cost (USD)** column so you can compare quality-per-dollar, not just quality alone.
176
+
177
+ ```
178
+ ┌──────┬────────┬───────┬───────┬──────┬──────┬──────────┬───────┬────────────┐
179
+ │ Rank │ Agent │ Score │ Tests │ Exit │ Lint │ Time (s) │ Lines │ Cost (USD) │
180
+ ├──────┼────────┼───────┼───────┼──────┼──────┼──────────┼───────┼────────────┤
181
+ │ 1 │ claude │ 85.0 │ PASS │ PASS │ PASS │ 10.5 │ 42 │ $0.0063 │
182
+ │ 2 │ codex │ 70.0 │ PASS │ PASS │ FAIL │ 15.2 │ 98 │ $0.0041 │
183
+ │ 3 │ aider │ 55.0 │ FAIL │ PASS │ PASS │ 8.1 │ 31 │ - │
184
+ └──────┴────────┴───────┴───────┴──────┴──────┴──────────┴───────┴────────────┘
185
+ ```
186
+
187
+ Cost appears in all output formats:
188
+ - **Terminal** — `Cost (USD)` column (shows `-` when unavailable)
189
+ - **Markdown** — `--format markdown` includes the column
190
+ - **JSON** — `cost` object per agent result with `input_tokens`, `output_tokens`, `estimated_cost_usd`, `model_name`, `pricing_source`
191
+ - **HTML report** — Cost column plus `$/score` ratio column for direct efficiency comparison
192
+
193
+ ### How it works
194
+
195
+ Each agent adapter parses token counts or cost lines from the agent's CLI output:
196
+
197
+ | Agent | Source |
198
+ |-------|--------|
199
+ | Claude Code | `usage.input_tokens` / `usage.output_tokens` from JSON output; or "Total cost: $N" lines |
200
+ | Codex | `prompt_tokens=N, completion_tokens=N` usage summary |
201
+ | Gemini CLI | `inputTokenCount=N, outputTokenCount=N` lines |
202
+ | Aider | "Tokens: N sent, N received. Cost: $N message" lines |
203
+ | OpenCode | "Total cost: $N" or generic token lines |
204
+
205
+ If token counts are unavailable, cost is estimated from input file size + output diff size (marked as `pricing_source: "estimated"`).
206
+
207
+ ### Disable cost tracking
208
+
209
+ ```bash
210
+ coderace run task.yaml --no-cost
211
+ ```
212
+
213
+ ## Custom Pricing
214
+
215
+ Override the default pricing table in your task YAML — useful for custom models, negotiated rates, or open-source deployments.
216
+
217
+ ```yaml
218
+ # pricing: per-agent or per-model overrides (USD per 1M tokens)
219
+ pricing:
220
+ claude:
221
+ input_per_1m: 3.00 # default for claude-sonnet-4-6
222
+ output_per_1m: 15.00
223
+ codex:
224
+ input_per_1m: 3.00
225
+ output_per_1m: 15.00
226
+ # Or use the model name directly:
227
+ claude-opus-4-6:
228
+ input_per_1m: 15.00
229
+ output_per_1m: 75.00
230
+ ```
231
+
232
+ Keys can be agent names (`claude`, `codex`, `aider`, `gemini`, `opencode`) or model names (`claude-sonnet-4-6`, `gpt-5.3-codex`, `gemini-2.5-pro`). The default pricing table covers:
233
+
234
+ | Model | Input ($/1M) | Output ($/1M) |
235
+ |-------|-------------|--------------|
236
+ | claude-sonnet-4-6 | $3.00 | $15.00 |
237
+ | claude-opus-4-6 | $15.00 | $75.00 |
238
+ | gpt-5.3-codex | $3.00 | $15.00 |
239
+ | gemini-2.5-pro | $1.25 | $10.00 |
240
+ | gemini-3.1-pro | $1.25 | $10.00 |
241
+
242
+ Pricing is easy to update: the table lives in `coderace/cost.py` as a plain dict.
243
+
173
244
  ## Supported Agents
174
245
 
175
246
  | Agent | CLI | Notes |
@@ -0,0 +1,97 @@
1
+ # All-Day Build Contract: Cost Tracking (v0.4.0)
2
+
3
+ Status: In Progress
4
+ Date: 2026-02-24
5
+ Owner: Sub-agent execution pass
6
+ Scope type: Deliverable-gated (no hour promises)
7
+
8
+ ## 1. Objective
9
+
10
+ Add cost tracking to coderace so users can compare coding agents on quality-per-dollar, not just quality. When a race finishes, each agent's result includes an estimated API cost. The results table shows a $/score column. This is the #1 missing comparison axis: everyone benchmarks speed and quality, but nobody automates cost comparison.
11
+
12
+ This contract is considered complete only when every deliverable and validation gate below is satisfied.
13
+
14
+ ## 2. Non-Negotiable Build Rules
15
+
16
+ 1. No time-based completion claims.
17
+ 2. Completion is allowed only when all checklist items are checked.
18
+ 3. Full test suite must pass at the end.
19
+ 4. New features must ship with docs and report addendum updates in the same pass.
20
+ 5. CLI outputs must be deterministic and schema-backed where specified.
21
+ 6. Never modify files outside the project directory.
22
+ 7. Commit after each completed deliverable (not at the end).
23
+ 8. If stuck on the same issue for 3 attempts, stop and write a blocker report.
24
+ 9. Do NOT refactor, restyle, or "improve" code outside the deliverables.
25
+ 10. Read existing tests and docs before writing new code.
26
+
27
+ ## 3. Feature Deliverables
28
+
29
+ ### D1. Cost estimation engine (core)
30
+
31
+ Build a cost estimation module that maps agent CLI output to dollar costs. Each agent adapter gets a `parse_cost()` method that extracts token counts or cost info from the agent's stdout/stderr.
32
+
33
+ Required:
34
+ - `coderace/cost.py` — pricing tables, cost calculation logic
35
+ - `coderace/adapters/*.py` — updated with parse_cost() methods
36
+
37
+ - [ ] Pricing table for: Claude Code (Sonnet 4.6, Opus 4.6), Codex (GPT-5.3), Gemini CLI (Gemini 2.5 Pro, Gemini 3.1 Pro), Aider (configurable model), OpenCode (configurable model)
38
+ - [ ] Parse token counts from each agent's output (Claude Code prints session summary, Codex prints usage, etc.)
39
+ - [ ] Fallback: if token counts unavailable, estimate from input file size + output diff size using per-model pricing
40
+ - [ ] CostResult dataclass: input_tokens, output_tokens, estimated_cost_usd, model_name, pricing_source
41
+ - [ ] Tests for D1: unit tests for each parser, edge cases (missing output, unknown model)
42
+
43
+ ### D2. Results integration
44
+
45
+ Integrate cost data into the race results pipeline. Show cost alongside score in all output formats.
46
+
47
+ Required:
48
+ - `coderace/commands/results.py` — updated
49
+ - `coderace/cli.py` — updated
50
+
51
+ - [ ] Race results include cost_usd field per agent
52
+ - [ ] `coderace results` terminal output shows Cost column
53
+ - [ ] `--format markdown` includes cost column
54
+ - [ ] `--format json` includes cost object
55
+ - [ ] HTML report includes cost column with $/score ratio
56
+ - [ ] Statistical mode (`--runs N`) aggregates cost: mean ± stddev
57
+ - [ ] Tests for D2
58
+
59
+ ### D3. Cost configuration
60
+
61
+ Allow users to override pricing in task YAML (for custom models, negotiated rates, etc.).
62
+
63
+ Required:
64
+ - `coderace/config.py` or extend task YAML schema
65
+
66
+ - [ ] `pricing:` section in task YAML: per-agent or per-model overrides
67
+ - [ ] `coderace init` template includes commented pricing example
68
+ - [ ] `--no-cost` flag to disable cost tracking entirely
69
+ - [ ] Tests for D3
70
+
71
+ ### D4. Documentation
72
+
73
+ - [ ] README section: "Cost Tracking" with example output
74
+ - [ ] README section: "Custom Pricing" showing YAML config
75
+ - [ ] CHANGELOG entry for v0.4.0
76
+ - [ ] Update example task YAMLs with pricing comments
77
+
78
+ ## 4. Test Requirements
79
+
80
+ - [ ] Unit tests for cost parsing (each adapter)
81
+ - [ ] Unit tests for pricing calculation
82
+ - [ ] Integration test: full race with cost output
83
+ - [ ] Edge cases: agent crashes (no cost data), unknown model, zero tokens
84
+ - [ ] All existing 130 tests must still pass
85
+
86
+ ## 5. Reports
87
+
88
+ - Write progress to `progress-log.md` after each deliverable
89
+ - Include: what was built, what tests pass, what's next, any blockers
90
+ - Final summary when all deliverables done or stopped
91
+
92
+ ## 6. Stop Conditions
93
+
94
+ - All deliverables checked and all tests passing -> DONE
95
+ - 3 consecutive failed attempts on the same issue -> STOP, write blocker report
96
+ - Scope creep detected (new requirements discovered) -> STOP, report what's new
97
+ - All tests passing but deliverables remain -> continue to next deliverable
@@ -0,0 +1,33 @@
1
+ """Aider adapter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ from coderace.adapters.base import BaseAdapter
8
+ from coderace.cost import CostResult, parse_aider_cost
9
+
10
+
11
+ class AiderAdapter(BaseAdapter):
12
+ """Adapter for Aider coding assistant."""
13
+
14
+ name = "aider"
15
+
16
+ def build_command(self, task_description: str) -> list[str]:
17
+ return [
18
+ "aider",
19
+ "--message",
20
+ task_description,
21
+ "--yes",
22
+ "--no-auto-commits",
23
+ ]
24
+
25
+ def parse_cost(
26
+ self,
27
+ stdout: str,
28
+ stderr: str,
29
+ model_name: str = "aider-default",
30
+ custom_pricing: dict[str, tuple[float, float]] | None = None,
31
+ ) -> Optional[CostResult]:
32
+ """Parse cost data from Aider output."""
33
+ return parse_aider_cost(stdout, stderr, model_name, custom_pricing)
@@ -6,7 +6,9 @@ import subprocess
6
6
  import time
7
7
  from abc import ABC, abstractmethod
8
8
  from pathlib import Path
9
+ from typing import Optional
9
10
 
11
+ from coderace.cost import CostResult
10
12
  from coderace.types import AgentResult
11
13
 
12
14
 
@@ -20,7 +22,27 @@ class BaseAdapter(ABC):
20
22
  """Build the CLI command to invoke this agent."""
21
23
  ...
22
24
 
23
- def run(self, task_description: str, workdir: Path, timeout: int) -> AgentResult:
25
+ def parse_cost(
26
+ self,
27
+ stdout: str,
28
+ stderr: str,
29
+ model_name: str = "",
30
+ custom_pricing: dict[str, tuple[float, float]] | None = None,
31
+ ) -> Optional[CostResult]:
32
+ """Parse cost data from agent output. Override in subclasses.
33
+
34
+ Returns None if cost data is unavailable.
35
+ """
36
+ return None
37
+
38
+ def run(
39
+ self,
40
+ task_description: str,
41
+ workdir: Path,
42
+ timeout: int,
43
+ no_cost: bool = False,
44
+ custom_pricing: dict[str, tuple[float, float]] | None = None,
45
+ ) -> AgentResult:
24
46
  """Run the agent on a task and capture results."""
25
47
  cmd = self.build_command(task_description)
26
48
  start = time.monotonic()
@@ -50,6 +72,14 @@ class BaseAdapter(ABC):
50
72
 
51
73
  wall_time = time.monotonic() - start
52
74
 
75
+ # Parse cost (fails gracefully — never raises)
76
+ cost_result: Optional[CostResult] = None
77
+ if not no_cost:
78
+ try:
79
+ cost_result = self.parse_cost(stdout, stderr, custom_pricing=custom_pricing)
80
+ except Exception:
81
+ pass
82
+
53
83
  return AgentResult(
54
84
  agent=self.name,
55
85
  exit_code=exit_code,
@@ -57,4 +87,5 @@ class BaseAdapter(ABC):
57
87
  stderr=stderr,
58
88
  wall_time=wall_time,
59
89
  timed_out=timed_out,
90
+ cost_result=cost_result,
60
91
  )
@@ -2,7 +2,10 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from typing import Optional
6
+
5
7
  from coderace.adapters.base import BaseAdapter
8
+ from coderace.cost import CostResult, parse_claude_cost
6
9
 
7
10
 
8
11
  class ClaudeAdapter(BaseAdapter):
@@ -19,3 +22,13 @@ class ClaudeAdapter(BaseAdapter):
19
22
  "-p",
20
23
  task_description,
21
24
  ]
25
+
26
+ def parse_cost(
27
+ self,
28
+ stdout: str,
29
+ stderr: str,
30
+ model_name: str = "claude-sonnet-4-6",
31
+ custom_pricing: dict[str, tuple[float, float]] | None = None,
32
+ ) -> Optional[CostResult]:
33
+ """Parse cost data from Claude Code output."""
34
+ return parse_claude_cost(stdout, stderr, model_name, custom_pricing)
@@ -0,0 +1,33 @@
1
+ """Codex CLI adapter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ from coderace.adapters.base import BaseAdapter
8
+ from coderace.cost import CostResult, parse_codex_cost
9
+
10
+
11
+ class CodexAdapter(BaseAdapter):
12
+ """Adapter for OpenAI Codex CLI."""
13
+
14
+ name = "codex"
15
+
16
+ def build_command(self, task_description: str) -> list[str]:
17
+ return [
18
+ "codex",
19
+ "--quiet",
20
+ "--full-auto",
21
+ "-p",
22
+ task_description,
23
+ ]
24
+
25
+ def parse_cost(
26
+ self,
27
+ stdout: str,
28
+ stderr: str,
29
+ model_name: str = "gpt-5.3-codex",
30
+ custom_pricing: dict[str, tuple[float, float]] | None = None,
31
+ ) -> Optional[CostResult]:
32
+ """Parse cost data from Codex CLI output."""
33
+ return parse_codex_cost(stdout, stderr, model_name, custom_pricing)
@@ -0,0 +1,32 @@
1
+ """Gemini CLI adapter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ from coderace.adapters.base import BaseAdapter
8
+ from coderace.cost import CostResult, parse_gemini_cost
9
+
10
+
11
+ class GeminiAdapter(BaseAdapter):
12
+ """Adapter for Google Gemini CLI."""
13
+
14
+ name = "gemini"
15
+
16
+ def build_command(self, task_description: str) -> list[str]:
17
+ return [
18
+ "gemini",
19
+ "--non-interactive",
20
+ "-p",
21
+ task_description,
22
+ ]
23
+
24
+ def parse_cost(
25
+ self,
26
+ stdout: str,
27
+ stderr: str,
28
+ model_name: str = "gemini-2.5-pro",
29
+ custom_pricing: dict[str, tuple[float, float]] | None = None,
30
+ ) -> Optional[CostResult]:
31
+ """Parse cost data from Gemini CLI output."""
32
+ return parse_gemini_cost(stdout, stderr, model_name, custom_pricing)
@@ -0,0 +1,31 @@
1
+ """OpenCode adapter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ from coderace.adapters.base import BaseAdapter
8
+ from coderace.cost import CostResult, parse_opencode_cost
9
+
10
+
11
+ class OpenCodeAdapter(BaseAdapter):
12
+ """Adapter for OpenCode CLI (terminal-first AI coding agent)."""
13
+
14
+ name = "opencode"
15
+
16
+ def build_command(self, task_description: str) -> list[str]:
17
+ return [
18
+ "opencode",
19
+ "run",
20
+ task_description,
21
+ ]
22
+
23
+ def parse_cost(
24
+ self,
25
+ stdout: str,
26
+ stderr: str,
27
+ model_name: str = "opencode-default",
28
+ custom_pricing: dict[str, tuple[float, float]] | None = None,
29
+ ) -> Optional[CostResult]:
30
+ """Parse cost data from OpenCode output."""
31
+ return parse_opencode_cost(stdout, stderr, model_name, custom_pricing)
@@ -51,6 +51,8 @@ def _run_agent_sequential(
51
51
  branch: str,
52
52
  base_ref: str,
53
53
  timeout: int,
54
+ no_cost: bool = False,
55
+ custom_pricing: dict | None = None,
54
56
  ) -> tuple[AgentResult | None, int]:
55
57
  """Run a single agent sequentially (on the main repo). Returns (result, lines_changed)."""
56
58
  try:
@@ -59,7 +61,7 @@ def _run_agent_sequential(
59
61
  return None, 0
60
62
 
61
63
  adapter = ADAPTERS[agent_name]()
62
- result = adapter.run(task_description, repo, timeout)
64
+ result = adapter.run(task_description, repo, timeout, no_cost=no_cost, custom_pricing=custom_pricing)
63
65
 
64
66
  _, lines = get_diff_stat(repo, base_ref)
65
67
  return result, lines
@@ -72,6 +74,8 @@ def _run_agent_worktree(
72
74
  branch: str,
73
75
  base_ref: str,
74
76
  timeout: int,
77
+ no_cost: bool = False,
78
+ custom_pricing: dict | None = None,
75
79
  ) -> tuple[AgentResult | None, int]:
76
80
  """Run a single agent in a git worktree (for parallel execution)."""
77
81
  import tempfile
@@ -87,7 +91,7 @@ def _run_agent_worktree(
87
91
  add_worktree(repo, worktree_dir, branch)
88
92
 
89
93
  adapter = ADAPTERS[agent_name]()
90
- result = adapter.run(task_description, worktree_dir, timeout)
94
+ result = adapter.run(task_description, worktree_dir, timeout, no_cost=no_cost, custom_pricing=custom_pricing)
91
95
 
92
96
  _, lines = get_diff_stat(worktree_dir, base_ref)
93
97
  return result, lines
@@ -114,6 +118,9 @@ def run(
114
118
  runs: int = typer.Option(
115
119
  1, "--runs", "-n", help="Number of runs (>1 for stats)"
116
120
  ),
121
+ no_cost: bool = typer.Option(
122
+ False, "--no-cost", help="Disable cost tracking"
123
+ ),
117
124
  ) -> None:
118
125
  """Run all agents on a task and score the results."""
119
126
  task = load_task(task_file)
@@ -195,6 +202,8 @@ def run(
195
202
  branch,
196
203
  base_ref,
197
204
  task.timeout,
205
+ no_cost,
206
+ task.pricing,
198
207
  )
199
208
  futures[future] = agent_name
200
209
 
@@ -248,6 +257,8 @@ def run(
248
257
  branch,
249
258
  base_ref,
250
259
  task.timeout,
260
+ no_cost=no_cost,
261
+ custom_pricing=task.pricing,
251
262
  )
252
263
 
253
264
  if result is None:
@@ -417,6 +428,8 @@ def _save_stats_json(
417
428
  "exit_clean_rate": s.exit_clean_rate,
418
429
  "lint_clean_rate": s.lint_clean_rate,
419
430
  "per_run_scores": s.per_run_scores,
431
+ "cost_mean": s.cost_mean,
432
+ "cost_stddev": s.cost_stddev,
420
433
  }
421
434
  )
422
435
 
@@ -484,9 +497,16 @@ def results(
484
497
  table.add_column("Lint", justify="center")
485
498
  table.add_column("Time (s)", justify="right")
486
499
  table.add_column("Lines", justify="right")
500
+ table.add_column("Cost (USD)", justify="right")
487
501
 
488
502
  for entry in data:
489
503
  b = entry["breakdown"]
504
+ cost_info = entry.get("cost")
505
+ cost_str = (
506
+ f"${cost_info['estimated_cost_usd']:.4f}"
507
+ if cost_info is not None
508
+ else "-"
509
+ )
490
510
  table.add_row(
491
511
  str(entry["rank"]),
492
512
  entry["agent"],
@@ -496,6 +516,7 @@ def results(
496
516
  _bool_icon(b["lint_clean"]),
497
517
  f"{b['wall_time']:.1f}",
498
518
  str(b["lines_changed"]),
519
+ cost_str,
499
520
  )
500
521
 
501
522
  console.print(table)
@@ -39,16 +39,21 @@ def format_markdown_results(scores: list[Score], task_name: str = "") -> str:
39
39
  )
40
40
 
41
41
  # Table header
42
- header = "| Rank | Agent | Score | Tests | Lint | Exit | Time (s) | Lines |\n"
43
- separator = "|------|-------|------:|:-----:|:----:|:----:|---------:|------:|\n"
42
+ header = "| Rank | Agent | Score | Tests | Lint | Exit | Time (s) | Lines | Cost (USD) |\n"
43
+ separator = "|------|-------|------:|:-----:|:----:|:----:|---------:|------:|-----------:|\n"
44
44
 
45
45
  rows: list[str] = []
46
46
  for i, score in enumerate(ranked, 1):
47
47
  b = score.breakdown
48
+ cost_str = (
49
+ f"${score.cost_result.estimated_cost_usd:.4f}"
50
+ if score.cost_result is not None
51
+ else "-"
52
+ )
48
53
  row = (
49
54
  f"| {i} | `{score.agent}` | {score.composite:.1f} |"
50
55
  f" {_bool_md(b.tests_pass)} | {_bool_md(b.lint_clean)} |"
51
- f" {_bool_md(b.exit_clean)} | {b.wall_time:.1f} | {b.lines_changed} |"
56
+ f" {_bool_md(b.exit_clean)} | {b.wall_time:.1f} | {b.lines_changed} | {cost_str} |"
52
57
  )
53
58
  rows.append(row)
54
59
 
@@ -84,12 +89,18 @@ def format_markdown_from_json(data: list[dict], task_name: str = "") -> str:
84
89
  heading = f"## coderace results: {task_name}\n\n" if task_name else "## coderace results\n\n"
85
90
  summary = f"**Winner:** `{agent}` — {score:.1f} pts | {n} agent(s) raced\n\n"
86
91
 
87
- header = "| Rank | Agent | Score | Tests | Lint | Exit | Time (s) | Lines |\n"
88
- separator = "|------|-------|------:|:-----:|:----:|:----:|---------:|------:|\n"
92
+ header = "| Rank | Agent | Score | Tests | Lint | Exit | Time (s) | Lines | Cost (USD) |\n"
93
+ separator = "|------|-------|------:|:-----:|:----:|:----:|---------:|------:|-----------:|\n"
89
94
 
90
95
  rows: list[str] = []
91
96
  for entry in data:
92
97
  b = entry.get("breakdown", {})
98
+ cost_info = entry.get("cost")
99
+ cost_str = (
100
+ f"${cost_info['estimated_cost_usd']:.4f}"
101
+ if cost_info is not None
102
+ else "-"
103
+ )
93
104
  rank = entry.get("rank", "?")
94
105
  a = entry.get("agent", "?")
95
106
  sc = entry.get("composite_score", 0.0)
@@ -98,7 +109,7 @@ def format_markdown_from_json(data: list[dict], task_name: str = "") -> str:
98
109
  f" {_bool_md(b.get('tests_pass', False))} |"
99
110
  f" {_bool_md(b.get('lint_clean', False))} |"
100
111
  f" {_bool_md(b.get('exit_clean', False))} |"
101
- f" {b.get('wall_time', 0.0):.1f} | {b.get('lines_changed', 0)} |"
112
+ f" {b.get('wall_time', 0.0):.1f} | {b.get('lines_changed', 0)} | {cost_str} |"
102
113
  )
103
114
  rows.append(row)
104
115