coderace 1.3.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderace-1.4.0/.claude-task.md +1 -0
- {coderace-1.3.0 → coderace-1.4.0}/CHANGELOG.md +10 -0
- {coderace-1.3.0 → coderace-1.4.0}/PKG-INFO +5 -1
- {coderace-1.3.0 → coderace-1.4.0}/README.md +4 -0
- coderace-1.4.0/all-day-build-contract-benchmark-tasks-v2.md +184 -0
- coderace-1.4.0/coderace/builtins/tasks/api-client.yaml +335 -0
- coderace-1.4.0/coderace/builtins/tasks/bug-hunt.yaml +444 -0
- coderace-1.4.0/coderace/builtins/tasks/concurrent-queue.yaml +328 -0
- coderace-1.4.0/coderace/builtins/tasks/refactor.yaml +589 -0
- {coderace-1.3.0 → coderace-1.4.0}/progress-log.md +39 -0
- {coderace-1.3.0 → coderace-1.4.0}/pyproject.toml +1 -1
- coderace-1.4.0/tests/test_benchmark_tasks_v2.py +83 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_builtins.py +5 -1
- {coderace-1.3.0 → coderace-1.4.0}/.github/workflows/publish.yml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/.gitignore +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/DONE.txt +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/LICENSE +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/action.yml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-benchmark.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-builtin-tasks.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-ci-integration.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-context-eval.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-cost-tracking.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-dashboard.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-leaderboard.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-model-selection.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-race-mode.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-v0.2.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-v090-tasks.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-v1.0-statistical.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-verification-tests.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/benchmark-results/fibonacci-2026-02-27.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/benchmark-results/fibonacci-v2-2026-02-27.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/benchmark-results/hard-tasks-2026-02-27.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/benchmark-results/multi-task-2026-02-27.md +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/__init__.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/__init__.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/aider.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/base.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/claude.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/codex.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/gemini.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/opencode.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/benchmark.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/benchmark_report.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/benchmark_stats.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/__init__.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/binary-search-tree.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/cli-args-parser.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/csv-analyzer.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/data-pipeline.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/diff-algorithm.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/expression-evaluator.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/fibonacci.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/file-watcher.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/http-server.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/json-parser.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/lru-cache.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/markdown-to-html.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/regex-engine.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/state-machine.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/task-scheduler.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/url-router.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/cli.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/__init__.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/benchmark.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/context_eval.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/dashboard.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/diff.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/history.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/leaderboard.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/race.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/results.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/tasks.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/context_eval.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/context_eval_report.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/cost.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/dashboard.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/elo.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/export.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/git_ops.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/html_report.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/publish.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/reporter.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/scorer.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/statistics.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/stats.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/store.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/task.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/coderace/types.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/demo-race.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/examples/add-type-hints.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/examples/ci-race-on-pr.yml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/examples/context-eval-demo.sh +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/examples/example-task.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/examples/fix-edge-case.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/examples/model-selection.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/examples/write-tests.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/scripts/ci-run.sh +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/scripts/format-comment.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tasks/markdown-table.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tasks/parse-duration.yaml +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/__init__.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/conftest.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_adapters.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_benchmark.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_benchmark_trials.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_benchmark_v1_integration.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_cli.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_cli_store_integration.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_context_eval.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_context_eval_dashboard.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_cost.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_cost_config.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_cost_integration.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_dashboard.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_dashboard_cli.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_diff.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_elo.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_examples.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_export.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_format_comment.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_full_workflow.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_git_ops.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_history.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_html_report.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_leaderboard.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_markdown_results.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_model_selection_d1_d2.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_model_selection_d3.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_model_selection_d4.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_publish.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_race.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_reporter.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_scorer.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_statistics.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_stats.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_store.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_task.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_tasks_cli.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/tests/test_verification_integration.py +0 -0
- {coderace-1.3.0 → coderace-1.4.0}/uv.lock +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Read the build contract in all-day-build-contract-benchmark-tasks-v2.md. Read existing tasks in coderace/builtins/tasks/ to understand the format. Then execute the contract: build all 5 deliverables (D1-D5). Follow the non-negotiable rules exactly. Commit after each deliverable. Run pytest tests/ -x after each commit to verify nothing breaks.
|
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [1.4.0] - 2026-03-05
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- **4 new benchmark tasks** testing real-world coding skills beyond "build from scratch":
|
|
7
|
+
- `bug-hunt`: Find and fix 5 planted bugs in a calculator module (debugging)
|
|
8
|
+
- `refactor`: Improve messy code while keeping existing tests passing (refactoring)
|
|
9
|
+
- `concurrent-queue`: Thread-safe priority queue with producer/consumer pattern (concurrency)
|
|
10
|
+
- `api-client`: HTTP client with retry, rate limiting, and circuit breaker (API design)
|
|
11
|
+
- Total built-in tasks: 20 (up from 16)
|
|
12
|
+
|
|
3
13
|
## [1.3.0] - 2026-03-05
|
|
4
14
|
|
|
5
15
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: coderace
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: Race coding agents against each other on real tasks
|
|
5
5
|
Project-URL: Homepage, https://github.com/mikiships/coderace
|
|
6
6
|
Project-URL: Repository, https://github.com/mikiships/coderace
|
|
@@ -217,6 +217,10 @@ coderace run --builtin fibonacci
|
|
|
217
217
|
| `url-router` | Hard | HTTP-style router with params, wildcard, and 405/404 logic |
|
|
218
218
|
| `diff-algorithm` | Hard | Unified diff + patch application roundtrip checks |
|
|
219
219
|
| `task-scheduler` | Hard | Dependency-aware priority scheduler with timeout handling |
|
|
220
|
+
| `bug-hunt` | Hard | Find and fix 5 planted bugs in a calculator module |
|
|
221
|
+
| `refactor` | Hard | Refactor messy code while keeping tests passing |
|
|
222
|
+
| `concurrent-queue` | Hard | Thread-safe priority queue with producer/consumer |
|
|
223
|
+
| `api-client` | Hard | HTTP client with retry, rate limiting, circuit breaker |
|
|
220
224
|
|
|
221
225
|
`coderace tasks list` now includes a `Verify` column so you can see which built-ins ship with verification suites.
|
|
222
226
|
|
|
@@ -187,6 +187,10 @@ coderace run --builtin fibonacci
|
|
|
187
187
|
| `url-router` | Hard | HTTP-style router with params, wildcard, and 405/404 logic |
|
|
188
188
|
| `diff-algorithm` | Hard | Unified diff + patch application roundtrip checks |
|
|
189
189
|
| `task-scheduler` | Hard | Dependency-aware priority scheduler with timeout handling |
|
|
190
|
+
| `bug-hunt` | Hard | Find and fix 5 planted bugs in a calculator module |
|
|
191
|
+
| `refactor` | Hard | Refactor messy code while keeping tests passing |
|
|
192
|
+
| `concurrent-queue` | Hard | Thread-safe priority queue with producer/consumer |
|
|
193
|
+
| `api-client` | Hard | HTTP client with retry, rate limiting, circuit breaker |
|
|
190
194
|
|
|
191
195
|
`coderace tasks list` now includes a `Verify` column so you can see which built-ins ship with verification suites.
|
|
192
196
|
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
# All-Day Build Contract: New Benchmark Tasks (Batch 2)
|
|
2
|
+
|
|
3
|
+
Status: In Progress
|
|
4
|
+
Date: 2026-03-05
|
|
5
|
+
Owner: Claude Code execution pass
|
|
6
|
+
Scope type: Deliverable-gated (no hour promises)
|
|
7
|
+
|
|
8
|
+
## 1. Objective
|
|
9
|
+
|
|
10
|
+
Add 4 new built-in benchmark tasks to coderace that test **real-world coding skills** beyond "build from scratch." Current 16 tasks all ask agents to implement something new. These 4 test debugging, refactoring, concurrency, and API design — skills that matter in production but aren't benchmarked.
|
|
11
|
+
|
|
12
|
+
This contract is considered complete only when every deliverable and validation gate below is satisfied.
|
|
13
|
+
|
|
14
|
+
## 2. Non-Negotiable Build Rules
|
|
15
|
+
|
|
16
|
+
1. No time-based completion claims.
|
|
17
|
+
2. Completion is allowed only when all checklist items are checked.
|
|
18
|
+
3. Full test suite must pass at the end (`pytest tests/ -x`).
|
|
19
|
+
4. New tasks must follow the exact same format as existing built-in tasks (see `coderace/builtins/tasks/` for examples).
|
|
20
|
+
5. CLI outputs must be deterministic and schema-backed where specified.
|
|
21
|
+
6. Never modify files outside the project directory.
|
|
22
|
+
7. Commit after each completed deliverable (not at the end).
|
|
23
|
+
8. If stuck on same issue for 3 attempts, stop and write a blocker report.
|
|
24
|
+
9. Do NOT refactor, restyle, or "improve" existing code or tasks.
|
|
25
|
+
10. Read existing tasks in `coderace/builtins/tasks/` before writing new ones. Match the exact YAML format, naming conventions, and verification test style.
|
|
26
|
+
|
|
27
|
+
## 3. Feature Deliverables
|
|
28
|
+
|
|
29
|
+
### D1. Task: `bug-hunt` (debugging — fix planted bugs in existing code)
|
|
30
|
+
|
|
31
|
+
**Concept:** Provide a working-looking module (`buggy_calculator.py`) with 5 planted bugs. The agent must find and fix them without being told exactly what's wrong.
|
|
32
|
+
|
|
33
|
+
The task description gives the agent:
|
|
34
|
+
- A pre-written `buggy_calculator.py` with an advanced calculator (expressions, variables, functions like `sqrt`, `abs`, `min`, `max`)
|
|
35
|
+
- A failing test file that demonstrates the bugs exist
|
|
36
|
+
- Instructions: "Fix ALL bugs. Do not rewrite from scratch."
|
|
37
|
+
|
|
38
|
+
Planted bugs (specific, not vague):
|
|
39
|
+
1. Division rounds to int instead of float (`//` instead of `/`)
|
|
40
|
+
2. Negative number parsing fails (misses unary minus in expressions)
|
|
41
|
+
3. Variable assignment overwrites built-in functions
|
|
42
|
+
4. `min(a, b)` returns max (arguments reversed)
|
|
43
|
+
5. Parenthesized expressions off-by-one in closing paren detection
|
|
44
|
+
|
|
45
|
+
Hidden verification tests:
|
|
46
|
+
- All 5 bugs must be fixed
|
|
47
|
+
- Original working functionality must still work
|
|
48
|
+
- Code must not be a complete rewrite (verify specific function signatures still exist)
|
|
49
|
+
- Edge cases: nested expressions with negatives, chained function calls
|
|
50
|
+
|
|
51
|
+
Required files:
|
|
52
|
+
- `coderace/builtins/tasks/bug-hunt.yaml`
|
|
53
|
+
- The YAML must include `setup_files:` with the buggy source and failing tests
|
|
54
|
+
|
|
55
|
+
- [ ] Write buggy_calculator.py with exactly 5 planted bugs
|
|
56
|
+
- [ ] Write test_buggy_calculator.py that exposes the bugs
|
|
57
|
+
- [ ] Write verify_bug_hunt.py with hidden tests for all 5 fixes + non-rewrite check
|
|
58
|
+
- [ ] Write bug-hunt.yaml task definition
|
|
59
|
+
- [ ] Tests for D1 (task loads, runs dry-run, YAML validates)
|
|
60
|
+
|
|
61
|
+
### D2. Task: `refactor` (improve existing messy code while keeping tests passing)
|
|
62
|
+
|
|
63
|
+
**Concept:** Provide a working but messy module (`data_store.py`) — a key-value store with bad naming, duplicated logic, no type hints, mixed concerns, and poor error handling. The agent must refactor it while keeping the existing test suite passing.
|
|
64
|
+
|
|
65
|
+
The task description gives the agent:
|
|
66
|
+
- `data_store.py` — a working but ugly key-value store (~150 lines, deliberately messy)
|
|
67
|
+
- `test_data_store.py` — passing tests (these must STILL pass after refactoring)
|
|
68
|
+
- Instructions: "Refactor for readability, maintainability, and best practices. All existing tests must pass. Do not change test file."
|
|
69
|
+
|
|
70
|
+
Quality dimensions to verify:
|
|
71
|
+
1. Type hints added (verify with mypy or ast inspection)
|
|
72
|
+
2. Functions under 25 lines each
|
|
73
|
+
3. No duplicated logic (measure before/after)
|
|
74
|
+
4. Consistent naming convention
|
|
75
|
+
5. Proper error handling (specific exceptions, not bare except)
|
|
76
|
+
|
|
77
|
+
Hidden verification tests:
|
|
78
|
+
- All original test_data_store.py tests still pass
|
|
79
|
+
- AST analysis: type hints present on all public functions
|
|
80
|
+
- AST analysis: no function exceeds 25 lines
|
|
81
|
+
- New edge cases that the refactored code should handle more gracefully
|
|
82
|
+
- Import check: the module still exports the same public API
|
|
83
|
+
|
|
84
|
+
Required files:
|
|
85
|
+
- `coderace/builtins/tasks/refactor.yaml`
|
|
86
|
+
|
|
87
|
+
- [ ] Write messy data_store.py (~150 lines, working but ugly)
|
|
88
|
+
- [ ] Write test_data_store.py with comprehensive passing tests
|
|
89
|
+
- [ ] Write verify_refactor.py with hidden quality checks (AST-based + functional)
|
|
90
|
+
- [ ] Write refactor.yaml task definition
|
|
91
|
+
- [ ] Tests for D2
|
|
92
|
+
|
|
93
|
+
### D3. Task: `concurrent-queue` (thread-safe producer/consumer with priority)
|
|
94
|
+
|
|
95
|
+
**Concept:** Build a thread-safe priority queue with producer/consumer pattern. Tests concurrency understanding — a known weak spot for LLMs.
|
|
96
|
+
|
|
97
|
+
The task description tells the agent to create `concurrent_queue.py` with:
|
|
98
|
+
- `PriorityTaskQueue` class
|
|
99
|
+
- `submit(task, priority)` — thread-safe task submission
|
|
100
|
+
- `worker(callback)` — starts a worker thread that processes tasks by priority
|
|
101
|
+
- `shutdown(wait=True)` — graceful shutdown
|
|
102
|
+
- Support for: task cancellation, timeout on get, max queue size with backpressure
|
|
103
|
+
- Thread-safe stats: tasks_submitted, tasks_completed, tasks_failed
|
|
104
|
+
|
|
105
|
+
And `test_concurrent_queue.py` with tests for:
|
|
106
|
+
- Basic submit and process
|
|
107
|
+
- Priority ordering (higher priority processed first)
|
|
108
|
+
- Multiple workers
|
|
109
|
+
- Graceful shutdown (pending tasks complete)
|
|
110
|
+
- Task cancellation
|
|
111
|
+
- Backpressure when queue is full
|
|
112
|
+
- Thread safety under concurrent access
|
|
113
|
+
|
|
114
|
+
Hidden verification tests:
|
|
115
|
+
- Stress test: 1000 tasks, 10 workers, verify all complete
|
|
116
|
+
- Priority ordering under load (submit mixed priorities, verify processing order)
|
|
117
|
+
- Shutdown with pending tasks (verify all complete before shutdown returns)
|
|
118
|
+
- Cancellation of in-flight vs queued tasks
|
|
119
|
+
- Deadlock detection (timeout-based, the test itself must complete in <10s)
|
|
120
|
+
|
|
121
|
+
Required files:
|
|
122
|
+
- `coderace/builtins/tasks/concurrent-queue.yaml`
|
|
123
|
+
|
|
124
|
+
- [ ] Write concurrent-queue.yaml with full spec
|
|
125
|
+
- [ ] Write verify_concurrent_queue.py with stress tests and deadlock detection
|
|
126
|
+
- [ ] Tests for D3
|
|
127
|
+
|
|
128
|
+
### D4. Task: `api-client` (HTTP client with retry, rate limiting, circuit breaker)
|
|
129
|
+
|
|
130
|
+
**Concept:** Build a production-grade HTTP API client with resilience patterns. Tests real-world engineering patterns.
|
|
131
|
+
|
|
132
|
+
The task description tells the agent to create `api_client.py` with:
|
|
133
|
+
- `APIClient(base_url, max_retries=3, rate_limit_per_sec=10)`
|
|
134
|
+
- Methods: `get`, `post`, `put`, `delete` — all async-compatible but sync by default
|
|
135
|
+
- Retry with exponential backoff (jitter) on 429 and 5xx
|
|
136
|
+
- Rate limiting (token bucket)
|
|
137
|
+
- Circuit breaker (open after N consecutive failures, half-open after timeout)
|
|
138
|
+
- Request/response logging
|
|
139
|
+
- Configurable timeout per request
|
|
140
|
+
|
|
141
|
+
And `test_api_client.py` with tests using unittest.mock to simulate server responses.
|
|
142
|
+
|
|
143
|
+
Hidden verification tests:
|
|
144
|
+
- Retry behavior: mock server returns 503 twice then 200, verify 3 attempts made
|
|
145
|
+
- Rate limiting: fire 20 requests, verify spacing respects rate limit
|
|
146
|
+
- Circuit breaker: force 5 failures, verify circuit opens, then half-opens after cooldown
|
|
147
|
+
- Backoff jitter: verify retry delays aren't deterministic (statistical test)
|
|
148
|
+
- Timeout handling: mock slow server, verify timeout raised
|
|
149
|
+
|
|
150
|
+
Required files:
|
|
151
|
+
- `coderace/builtins/tasks/api-client.yaml`
|
|
152
|
+
|
|
153
|
+
- [ ] Write api-client.yaml with full spec
|
|
154
|
+
- [ ] Write verify_api_client.py with resilience pattern tests
|
|
155
|
+
- [ ] Tests for D4
|
|
156
|
+
|
|
157
|
+
### D5. Integration + Documentation
|
|
158
|
+
|
|
159
|
+
- [ ] All 4 new tasks appear in `coderace tasks list` output
|
|
160
|
+
- [ ] `coderace benchmark --dry-run --tasks bug-hunt,refactor,concurrent-queue,api-client` works
|
|
161
|
+
- [ ] Update CHANGELOG.md with new tasks
|
|
162
|
+
- [ ] Version bump to 1.4.0 in pyproject.toml
|
|
163
|
+
- [ ] README: add "20 built-in tasks" (update from 16) and mention the new task categories
|
|
164
|
+
- [ ] All existing tests still pass (`pytest tests/ -x`)
|
|
165
|
+
|
|
166
|
+
## 4. Test Requirements
|
|
167
|
+
|
|
168
|
+
- [ ] Unit tests for each new task (YAML loads, validates, dry-run works)
|
|
169
|
+
- [ ] Integration test: `coderace benchmark --dry-run --tasks bug-hunt,refactor,concurrent-queue,api-client` completes
|
|
170
|
+
- [ ] All existing 574 tests must still pass
|
|
171
|
+
- [ ] New tests bring total to 590+
|
|
172
|
+
|
|
173
|
+
## 5. Reports
|
|
174
|
+
|
|
175
|
+
- Write progress to `progress-log.md` after each deliverable
|
|
176
|
+
- Include: what was built, what tests pass, what's next, any blockers
|
|
177
|
+
- Final summary when all deliverables done or stopped
|
|
178
|
+
|
|
179
|
+
## 6. Stop Conditions
|
|
180
|
+
|
|
181
|
+
- All deliverables checked and all tests passing -> DONE
|
|
182
|
+
- 3 consecutive failed attempts on same issue -> STOP, write blocker report
|
|
183
|
+
- Scope creep detected (new requirements discovered) -> STOP, report what's new
|
|
184
|
+
- All tests passing but deliverables remain -> continue to next deliverable
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
name: api-client
|
|
2
|
+
difficulty: hard
|
|
3
|
+
description: |
|
|
4
|
+
Build a production-grade HTTP API client with resilience patterns.
|
|
5
|
+
|
|
6
|
+
Create a file called `api_client.py` with the following class:
|
|
7
|
+
|
|
8
|
+
```python
|
|
9
|
+
class APIClient:
|
|
10
|
+
def __init__(
|
|
11
|
+
self,
|
|
12
|
+
base_url: str,
|
|
13
|
+
max_retries: int = 3,
|
|
14
|
+
rate_limit_per_sec: float = 10,
|
|
15
|
+
timeout: float = 30.0,
|
|
16
|
+
circuit_breaker_threshold: int = 5,
|
|
17
|
+
circuit_breaker_timeout: float = 30.0,
|
|
18
|
+
):
|
|
19
|
+
"""Initialize the API client.
|
|
20
|
+
|
|
21
|
+
Args:
|
|
22
|
+
base_url: Base URL for all requests (e.g., "https://api.example.com")
|
|
23
|
+
max_retries: Max retry attempts on 429/5xx responses
|
|
24
|
+
rate_limit_per_sec: Max requests per second (token bucket)
|
|
25
|
+
timeout: Default timeout per request in seconds
|
|
26
|
+
circuit_breaker_threshold: Consecutive failures before circuit opens
|
|
27
|
+
circuit_breaker_timeout: Seconds before circuit transitions to half-open
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
def get(self, path: str, **kwargs) -> requests.Response:
|
|
31
|
+
"""Send a GET request."""
|
|
32
|
+
|
|
33
|
+
def post(self, path: str, **kwargs) -> requests.Response:
|
|
34
|
+
"""Send a POST request."""
|
|
35
|
+
|
|
36
|
+
def put(self, path: str, **kwargs) -> requests.Response:
|
|
37
|
+
"""Send a PUT request."""
|
|
38
|
+
|
|
39
|
+
def delete(self, path: str, **kwargs) -> requests.Response:
|
|
40
|
+
"""Send a DELETE request."""
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Requirements:
|
|
44
|
+
- All HTTP methods support `timeout`, `headers`, `params`, `json`, `data` kwargs
|
|
45
|
+
- Retry with exponential backoff + jitter on 429 and 5xx status codes
|
|
46
|
+
- Rate limiting using token bucket algorithm (max N requests per second)
|
|
47
|
+
- Circuit breaker pattern:
|
|
48
|
+
- CLOSED: requests flow normally
|
|
49
|
+
- OPEN: after N consecutive failures, immediately raise `CircuitBreakerOpen`
|
|
50
|
+
- HALF-OPEN: after timeout, allow one test request through
|
|
51
|
+
- Success in HALF-OPEN -> CLOSED; failure -> back to OPEN
|
|
52
|
+
- Custom exceptions: `CircuitBreakerOpen`, `APIClientError`
|
|
53
|
+
- `request_log` attribute: list of dicts with keys `method`, `url`, `status_code`, `timestamp`
|
|
54
|
+
- Uses the `requests` library for HTTP
|
|
55
|
+
|
|
56
|
+
Also create `test_api_client.py` with tests using `unittest.mock` to mock HTTP responses:
|
|
57
|
+
- Basic GET/POST/PUT/DELETE
|
|
58
|
+
- Retry on 503 then success
|
|
59
|
+
- Rate limiting (verify delays between rapid requests)
|
|
60
|
+
- Circuit breaker opens after consecutive failures
|
|
61
|
+
- Circuit breaker transitions from OPEN to HALF-OPEN to CLOSED
|
|
62
|
+
- Timeout handling
|
|
63
|
+
- Request logging
|
|
64
|
+
|
|
65
|
+
Do NOT modify any files other than `api_client.py` and `test_api_client.py`.
|
|
66
|
+
repo: .
|
|
67
|
+
test_command: python3 -m pytest test_api_client.py -x -q
|
|
68
|
+
verify_command: python3 -m pytest verify_api_client.py -x -q
|
|
69
|
+
verify_files:
|
|
70
|
+
verify_api_client.py: |
|
|
71
|
+
"""Hidden verification tests for api-client task."""
|
|
72
|
+
|
|
73
|
+
import time
|
|
74
|
+
from unittest.mock import MagicMock, patch, PropertyMock
|
|
75
|
+
|
|
76
|
+
import pytest
|
|
77
|
+
|
|
78
|
+
from api_client import APIClient, CircuitBreakerOpen
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _mock_response(status_code=200, json_data=None):
|
|
82
|
+
resp = MagicMock()
|
|
83
|
+
resp.status_code = status_code
|
|
84
|
+
resp.json.return_value = json_data or {}
|
|
85
|
+
resp.ok = 200 <= status_code < 300
|
|
86
|
+
resp.raise_for_status = MagicMock()
|
|
87
|
+
if status_code >= 400:
|
|
88
|
+
resp.raise_for_status.side_effect = Exception(f"HTTP {status_code}")
|
|
89
|
+
return resp
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class TestRetryBehavior:
|
|
93
|
+
@patch("api_client.requests.Session.request")
|
|
94
|
+
def test_retries_on_503_then_succeeds(self, mock_request):
|
|
95
|
+
mock_request.side_effect = [
|
|
96
|
+
_mock_response(503),
|
|
97
|
+
_mock_response(503),
|
|
98
|
+
_mock_response(200, {"ok": True}),
|
|
99
|
+
]
|
|
100
|
+
client = APIClient(
|
|
101
|
+
"http://test.com",
|
|
102
|
+
max_retries=3,
|
|
103
|
+
rate_limit_per_sec=1000,
|
|
104
|
+
)
|
|
105
|
+
resp = client.get("/endpoint")
|
|
106
|
+
assert resp.status_code == 200
|
|
107
|
+
assert mock_request.call_count == 3
|
|
108
|
+
|
|
109
|
+
@patch("api_client.requests.Session.request")
|
|
110
|
+
def test_retries_on_429(self, mock_request):
|
|
111
|
+
mock_request.side_effect = [
|
|
112
|
+
_mock_response(429),
|
|
113
|
+
_mock_response(200),
|
|
114
|
+
]
|
|
115
|
+
client = APIClient(
|
|
116
|
+
"http://test.com",
|
|
117
|
+
max_retries=3,
|
|
118
|
+
rate_limit_per_sec=1000,
|
|
119
|
+
)
|
|
120
|
+
resp = client.get("/path")
|
|
121
|
+
assert resp.status_code == 200
|
|
122
|
+
assert mock_request.call_count == 2
|
|
123
|
+
|
|
124
|
+
@patch("api_client.requests.Session.request")
|
|
125
|
+
def test_retries_exhausted_returns_last_response(self, mock_request):
|
|
126
|
+
mock_request.side_effect = [
|
|
127
|
+
_mock_response(503),
|
|
128
|
+
_mock_response(503),
|
|
129
|
+
_mock_response(503),
|
|
130
|
+
]
|
|
131
|
+
client = APIClient(
|
|
132
|
+
"http://test.com",
|
|
133
|
+
max_retries=3,
|
|
134
|
+
rate_limit_per_sec=1000,
|
|
135
|
+
)
|
|
136
|
+
resp = client.get("/fail")
|
|
137
|
+
assert resp.status_code == 503
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class TestBackoffJitter:
|
|
141
|
+
@patch("api_client.requests.Session.request")
|
|
142
|
+
def test_retry_delays_have_jitter(self, mock_request):
|
|
143
|
+
"""Retry delays should not be deterministic."""
|
|
144
|
+
mock_request.side_effect = [
|
|
145
|
+
_mock_response(503),
|
|
146
|
+
_mock_response(503),
|
|
147
|
+
_mock_response(200),
|
|
148
|
+
]
|
|
149
|
+
delays = []
|
|
150
|
+
|
|
151
|
+
original_sleep = time.sleep
|
|
152
|
+
def capture_sleep(duration):
|
|
153
|
+
delays.append(duration)
|
|
154
|
+
# Don't actually sleep in tests
|
|
155
|
+
|
|
156
|
+
client = APIClient(
|
|
157
|
+
"http://test.com",
|
|
158
|
+
max_retries=3,
|
|
159
|
+
rate_limit_per_sec=1000,
|
|
160
|
+
)
|
|
161
|
+
with patch("time.sleep", side_effect=capture_sleep):
|
|
162
|
+
client.get("/test")
|
|
163
|
+
|
|
164
|
+
# Should have at least 1 retry delay
|
|
165
|
+
retry_delays = [d for d in delays if d > 0.001]
|
|
166
|
+
assert len(retry_delays) >= 1
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class TestRateLimiting:
|
|
170
|
+
@patch("api_client.requests.Session.request")
|
|
171
|
+
def test_rate_limit_spacing(self, mock_request):
|
|
172
|
+
mock_request.return_value = _mock_response(200)
|
|
173
|
+
# Very low rate limit to make spacing measurable
|
|
174
|
+
client = APIClient(
|
|
175
|
+
"http://test.com",
|
|
176
|
+
max_retries=0,
|
|
177
|
+
rate_limit_per_sec=5,
|
|
178
|
+
)
|
|
179
|
+
start = time.time()
|
|
180
|
+
for _ in range(6):
|
|
181
|
+
client.get("/test")
|
|
182
|
+
elapsed = time.time() - start
|
|
183
|
+
# 6 requests at 5/sec should take at least ~1 second
|
|
184
|
+
assert elapsed >= 0.8
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class TestCircuitBreaker:
    """Exercises the circuit breaker: opening, half-open probe, and closing."""

    @staticmethod
    def _client_with_breaker(recovery_timeout):
        """Build a client whose breaker threshold is 3 failures."""
        return APIClient(
            "http://test.com",
            max_retries=1,
            rate_limit_per_sec=1000,
            circuit_breaker_threshold=3,
            circuit_breaker_timeout=recovery_timeout,
        )

    @staticmethod
    def _hit_failing_endpoint(client, attempts):
        """Call ``/fail`` repeatedly, tolerating an already-open circuit."""
        for _ in range(attempts):
            try:
                client.get("/fail")
            except CircuitBreakerOpen:
                continue

    @patch("api_client.requests.Session.request")
    def test_circuit_opens_after_consecutive_failures(self, mock_request):
        """After three failing calls the next request must be rejected."""
        mock_request.return_value = _mock_response(503)
        api = self._client_with_breaker(60)

        # Cause consecutive failures to open the circuit
        self._hit_failing_endpoint(api, 3)

        # Now circuit should be open
        with pytest.raises(CircuitBreakerOpen):
            api.get("/blocked")

    @patch("api_client.requests.Session.request")
    def test_circuit_half_open_after_timeout(self, mock_request):
        """Once the breaker timeout elapses, a request is let through."""
        mock_request.return_value = _mock_response(503)
        api = self._client_with_breaker(0.1)

        # Open the circuit
        self._hit_failing_endpoint(api, 5)

        # Wait for timeout to elapse
        time.sleep(0.15)

        # Should allow one request through (half-open)
        mock_request.return_value = _mock_response(200)
        recovered = api.get("/recover")
        assert recovered.status_code == 200

    @patch("api_client.requests.Session.request")
    def test_circuit_closes_on_half_open_success(self, mock_request):
        """A successful half-open request lets later requests succeed too."""
        mock_request.return_value = _mock_response(503)
        api = self._client_with_breaker(0.1)

        # Open the circuit
        self._hit_failing_endpoint(api, 5)

        time.sleep(0.15)

        # Half-open success
        mock_request.return_value = _mock_response(200)
        api.get("/recover")

        # Circuit should be closed now — further requests should work
        follow_up = api.get("/another")
        assert follow_up.status_code == 200
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class TestHTTPMethods:
    """Checks that POST, PUT and DELETE return the mocked response."""

    @patch("api_client.requests.Session.request")
    def test_post(self, mock_request):
        """``post`` returns the mocked 201 response."""
        mock_request.return_value = _mock_response(201)
        api = APIClient("http://test.com", rate_limit_per_sec=1000)
        created = api.post("/data", json={"key": "value"})
        assert created.status_code == 201

    @patch("api_client.requests.Session.request")
    def test_put(self, mock_request):
        """``put`` returns the mocked 200 response."""
        mock_request.return_value = _mock_response(200)
        api = APIClient("http://test.com", rate_limit_per_sec=1000)
        updated = api.put("/data/1", json={"key": "updated"})
        assert updated.status_code == 200

    @patch("api_client.requests.Session.request")
    def test_delete(self, mock_request):
        """``delete`` returns the mocked 204 response."""
        mock_request.return_value = _mock_response(204)
        api = APIClient("http://test.com", rate_limit_per_sec=1000)
        removed = api.delete("/data/1")
        assert removed.status_code == 204
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
class TestRequestLogging:
    """Checks the entries the client exposes through ``request_log``."""

    @patch("api_client.requests.Session.request")
    def test_requests_are_logged(self, mock_request):
        """Two calls yield at least two entries; the first describes the GET."""
        mock_request.return_value = _mock_response(200)
        api = APIClient("http://test.com", rate_limit_per_sec=1000)
        api.get("/a")
        api.post("/b")

        entries = api.request_log
        assert len(entries) >= 2

        # The first entry must describe the GET to "/a".
        first = entries[0]
        assert first["method"].upper() == "GET"
        assert "/a" in first["url"]
        assert first["status_code"] == 200
        assert "timestamp" in first
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
class TestTimeout:
    """Checks that the configured timeout reaches the underlying request."""

    @patch("api_client.requests.Session.request")
    def test_timeout_passed_to_request(self, mock_request):
        """The 5.0 s timeout must be forwarded, by keyword or positionally."""
        mock_request.return_value = _mock_response(200)
        client = APIClient(
            "http://test.com",
            rate_limit_per_sec=1000,
            timeout=5.0,
        )
        client.get("/test")
        last_call = mock_request.call_args

        # Timeout should be passed to the underlying request. The value
        # itself is compared: merely finding the word "timeout" in the call
        # repr would also accept a wrong timeout value.
        sent_as_keyword = last_call.kwargs.get("timeout") == 5.0
        sent_positionally = any(
            isinstance(arg, (int, float)) and arg == 5.0
            for arg in last_call.args
        )
        assert sent_as_keyword or sent_positionally
|
|
321
|
+
lint_command: ruff check api_client.py
|
|
322
|
+
timeout: 600
|
|
323
|
+
agents:
|
|
324
|
+
- claude
|
|
325
|
+
- codex
|
|
326
|
+
- gemini
|
|
327
|
+
- aider
|
|
328
|
+
- opencode
|
|
329
|
+
scoring:
|
|
330
|
+
tests: 25
|
|
331
|
+
verify: 30
|
|
332
|
+
exit: 20
|
|
333
|
+
lint: 15
|
|
334
|
+
time: 5
|
|
335
|
+
lines: 5
|