coderace 1.3.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. coderace-1.4.0/.claude-task.md +1 -0
  2. {coderace-1.3.0 → coderace-1.4.0}/CHANGELOG.md +10 -0
  3. {coderace-1.3.0 → coderace-1.4.0}/PKG-INFO +5 -1
  4. {coderace-1.3.0 → coderace-1.4.0}/README.md +4 -0
  5. coderace-1.4.0/all-day-build-contract-benchmark-tasks-v2.md +184 -0
  6. coderace-1.4.0/coderace/builtins/tasks/api-client.yaml +335 -0
  7. coderace-1.4.0/coderace/builtins/tasks/bug-hunt.yaml +444 -0
  8. coderace-1.4.0/coderace/builtins/tasks/concurrent-queue.yaml +328 -0
  9. coderace-1.4.0/coderace/builtins/tasks/refactor.yaml +589 -0
  10. {coderace-1.3.0 → coderace-1.4.0}/progress-log.md +39 -0
  11. {coderace-1.3.0 → coderace-1.4.0}/pyproject.toml +1 -1
  12. coderace-1.4.0/tests/test_benchmark_tasks_v2.py +83 -0
  13. {coderace-1.3.0 → coderace-1.4.0}/tests/test_builtins.py +5 -1
  14. {coderace-1.3.0 → coderace-1.4.0}/.github/workflows/publish.yml +0 -0
  15. {coderace-1.3.0 → coderace-1.4.0}/.gitignore +0 -0
  16. {coderace-1.3.0 → coderace-1.4.0}/DONE.txt +0 -0
  17. {coderace-1.3.0 → coderace-1.4.0}/LICENSE +0 -0
  18. {coderace-1.3.0 → coderace-1.4.0}/action.yml +0 -0
  19. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-benchmark.md +0 -0
  20. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-builtin-tasks.md +0 -0
  21. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-ci-integration.md +0 -0
  22. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-context-eval.md +0 -0
  23. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-cost-tracking.md +0 -0
  24. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-dashboard.md +0 -0
  25. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-leaderboard.md +0 -0
  26. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-model-selection.md +0 -0
  27. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-race-mode.md +0 -0
  28. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-v0.2.md +0 -0
  29. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-v090-tasks.md +0 -0
  30. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-v1.0-statistical.md +0 -0
  31. {coderace-1.3.0 → coderace-1.4.0}/all-day-build-contract-verification-tests.md +0 -0
  32. {coderace-1.3.0 → coderace-1.4.0}/benchmark-results/fibonacci-2026-02-27.md +0 -0
  33. {coderace-1.3.0 → coderace-1.4.0}/benchmark-results/fibonacci-v2-2026-02-27.md +0 -0
  34. {coderace-1.3.0 → coderace-1.4.0}/benchmark-results/hard-tasks-2026-02-27.md +0 -0
  35. {coderace-1.3.0 → coderace-1.4.0}/benchmark-results/multi-task-2026-02-27.md +0 -0
  36. {coderace-1.3.0 → coderace-1.4.0}/coderace/__init__.py +0 -0
  37. {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/__init__.py +0 -0
  38. {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/aider.py +0 -0
  39. {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/base.py +0 -0
  40. {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/claude.py +0 -0
  41. {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/codex.py +0 -0
  42. {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/gemini.py +0 -0
  43. {coderace-1.3.0 → coderace-1.4.0}/coderace/adapters/opencode.py +0 -0
  44. {coderace-1.3.0 → coderace-1.4.0}/coderace/benchmark.py +0 -0
  45. {coderace-1.3.0 → coderace-1.4.0}/coderace/benchmark_report.py +0 -0
  46. {coderace-1.3.0 → coderace-1.4.0}/coderace/benchmark_stats.py +0 -0
  47. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/__init__.py +0 -0
  48. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/binary-search-tree.yaml +0 -0
  49. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/cli-args-parser.yaml +0 -0
  50. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/csv-analyzer.yaml +0 -0
  51. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/data-pipeline.yaml +0 -0
  52. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/diff-algorithm.yaml +0 -0
  53. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/expression-evaluator.yaml +0 -0
  54. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/fibonacci.yaml +0 -0
  55. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/file-watcher.yaml +0 -0
  56. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/http-server.yaml +0 -0
  57. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/json-parser.yaml +0 -0
  58. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/lru-cache.yaml +0 -0
  59. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/markdown-to-html.yaml +0 -0
  60. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/regex-engine.yaml +0 -0
  61. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/state-machine.yaml +0 -0
  62. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/task-scheduler.yaml +0 -0
  63. {coderace-1.3.0 → coderace-1.4.0}/coderace/builtins/tasks/url-router.yaml +0 -0
  64. {coderace-1.3.0 → coderace-1.4.0}/coderace/cli.py +0 -0
  65. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/__init__.py +0 -0
  66. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/benchmark.py +0 -0
  67. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/context_eval.py +0 -0
  68. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/dashboard.py +0 -0
  69. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/diff.py +0 -0
  70. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/history.py +0 -0
  71. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/leaderboard.py +0 -0
  72. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/race.py +0 -0
  73. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/results.py +0 -0
  74. {coderace-1.3.0 → coderace-1.4.0}/coderace/commands/tasks.py +0 -0
  75. {coderace-1.3.0 → coderace-1.4.0}/coderace/context_eval.py +0 -0
  76. {coderace-1.3.0 → coderace-1.4.0}/coderace/context_eval_report.py +0 -0
  77. {coderace-1.3.0 → coderace-1.4.0}/coderace/cost.py +0 -0
  78. {coderace-1.3.0 → coderace-1.4.0}/coderace/dashboard.py +0 -0
  79. {coderace-1.3.0 → coderace-1.4.0}/coderace/elo.py +0 -0
  80. {coderace-1.3.0 → coderace-1.4.0}/coderace/export.py +0 -0
  81. {coderace-1.3.0 → coderace-1.4.0}/coderace/git_ops.py +0 -0
  82. {coderace-1.3.0 → coderace-1.4.0}/coderace/html_report.py +0 -0
  83. {coderace-1.3.0 → coderace-1.4.0}/coderace/publish.py +0 -0
  84. {coderace-1.3.0 → coderace-1.4.0}/coderace/reporter.py +0 -0
  85. {coderace-1.3.0 → coderace-1.4.0}/coderace/scorer.py +0 -0
  86. {coderace-1.3.0 → coderace-1.4.0}/coderace/statistics.py +0 -0
  87. {coderace-1.3.0 → coderace-1.4.0}/coderace/stats.py +0 -0
  88. {coderace-1.3.0 → coderace-1.4.0}/coderace/store.py +0 -0
  89. {coderace-1.3.0 → coderace-1.4.0}/coderace/task.py +0 -0
  90. {coderace-1.3.0 → coderace-1.4.0}/coderace/types.py +0 -0
  91. {coderace-1.3.0 → coderace-1.4.0}/demo-race.yaml +0 -0
  92. {coderace-1.3.0 → coderace-1.4.0}/examples/add-type-hints.yaml +0 -0
  93. {coderace-1.3.0 → coderace-1.4.0}/examples/ci-race-on-pr.yml +0 -0
  94. {coderace-1.3.0 → coderace-1.4.0}/examples/context-eval-demo.sh +0 -0
  95. {coderace-1.3.0 → coderace-1.4.0}/examples/example-task.yaml +0 -0
  96. {coderace-1.3.0 → coderace-1.4.0}/examples/fix-edge-case.yaml +0 -0
  97. {coderace-1.3.0 → coderace-1.4.0}/examples/model-selection.yaml +0 -0
  98. {coderace-1.3.0 → coderace-1.4.0}/examples/write-tests.yaml +0 -0
  99. {coderace-1.3.0 → coderace-1.4.0}/scripts/ci-run.sh +0 -0
  100. {coderace-1.3.0 → coderace-1.4.0}/scripts/format-comment.py +0 -0
  101. {coderace-1.3.0 → coderace-1.4.0}/tasks/markdown-table.yaml +0 -0
  102. {coderace-1.3.0 → coderace-1.4.0}/tasks/parse-duration.yaml +0 -0
  103. {coderace-1.3.0 → coderace-1.4.0}/tests/__init__.py +0 -0
  104. {coderace-1.3.0 → coderace-1.4.0}/tests/conftest.py +0 -0
  105. {coderace-1.3.0 → coderace-1.4.0}/tests/test_adapters.py +0 -0
  106. {coderace-1.3.0 → coderace-1.4.0}/tests/test_benchmark.py +0 -0
  107. {coderace-1.3.0 → coderace-1.4.0}/tests/test_benchmark_trials.py +0 -0
  108. {coderace-1.3.0 → coderace-1.4.0}/tests/test_benchmark_v1_integration.py +0 -0
  109. {coderace-1.3.0 → coderace-1.4.0}/tests/test_cli.py +0 -0
  110. {coderace-1.3.0 → coderace-1.4.0}/tests/test_cli_store_integration.py +0 -0
  111. {coderace-1.3.0 → coderace-1.4.0}/tests/test_context_eval.py +0 -0
  112. {coderace-1.3.0 → coderace-1.4.0}/tests/test_context_eval_dashboard.py +0 -0
  113. {coderace-1.3.0 → coderace-1.4.0}/tests/test_cost.py +0 -0
  114. {coderace-1.3.0 → coderace-1.4.0}/tests/test_cost_config.py +0 -0
  115. {coderace-1.3.0 → coderace-1.4.0}/tests/test_cost_integration.py +0 -0
  116. {coderace-1.3.0 → coderace-1.4.0}/tests/test_dashboard.py +0 -0
  117. {coderace-1.3.0 → coderace-1.4.0}/tests/test_dashboard_cli.py +0 -0
  118. {coderace-1.3.0 → coderace-1.4.0}/tests/test_diff.py +0 -0
  119. {coderace-1.3.0 → coderace-1.4.0}/tests/test_elo.py +0 -0
  120. {coderace-1.3.0 → coderace-1.4.0}/tests/test_examples.py +0 -0
  121. {coderace-1.3.0 → coderace-1.4.0}/tests/test_export.py +0 -0
  122. {coderace-1.3.0 → coderace-1.4.0}/tests/test_format_comment.py +0 -0
  123. {coderace-1.3.0 → coderace-1.4.0}/tests/test_full_workflow.py +0 -0
  124. {coderace-1.3.0 → coderace-1.4.0}/tests/test_git_ops.py +0 -0
  125. {coderace-1.3.0 → coderace-1.4.0}/tests/test_history.py +0 -0
  126. {coderace-1.3.0 → coderace-1.4.0}/tests/test_html_report.py +0 -0
  127. {coderace-1.3.0 → coderace-1.4.0}/tests/test_leaderboard.py +0 -0
  128. {coderace-1.3.0 → coderace-1.4.0}/tests/test_markdown_results.py +0 -0
  129. {coderace-1.3.0 → coderace-1.4.0}/tests/test_model_selection_d1_d2.py +0 -0
  130. {coderace-1.3.0 → coderace-1.4.0}/tests/test_model_selection_d3.py +0 -0
  131. {coderace-1.3.0 → coderace-1.4.0}/tests/test_model_selection_d4.py +0 -0
  132. {coderace-1.3.0 → coderace-1.4.0}/tests/test_publish.py +0 -0
  133. {coderace-1.3.0 → coderace-1.4.0}/tests/test_race.py +0 -0
  134. {coderace-1.3.0 → coderace-1.4.0}/tests/test_reporter.py +0 -0
  135. {coderace-1.3.0 → coderace-1.4.0}/tests/test_scorer.py +0 -0
  136. {coderace-1.3.0 → coderace-1.4.0}/tests/test_statistics.py +0 -0
  137. {coderace-1.3.0 → coderace-1.4.0}/tests/test_stats.py +0 -0
  138. {coderace-1.3.0 → coderace-1.4.0}/tests/test_store.py +0 -0
  139. {coderace-1.3.0 → coderace-1.4.0}/tests/test_task.py +0 -0
  140. {coderace-1.3.0 → coderace-1.4.0}/tests/test_tasks_cli.py +0 -0
  141. {coderace-1.3.0 → coderace-1.4.0}/tests/test_verification_integration.py +0 -0
  142. {coderace-1.3.0 → coderace-1.4.0}/uv.lock +0 -0
@@ -0,0 +1 @@
1
+ Read the build contract in all-day-build-contract-benchmark-tasks-v2.md. Read the existing tasks in coderace/builtins/tasks/ to understand the format. Then execute the contract: build all 5 deliverables (D1-D5). Follow the non-negotiable rules exactly. Commit after each deliverable. Run `pytest tests/ -x` after each commit to verify nothing breaks.
@@ -1,5 +1,15 @@
1
1
  # Changelog
2
2
 
3
+ ## [1.4.0] - 2026-03-05
4
+
5
+ ### Added
6
+ - **4 new benchmark tasks** testing real-world coding skills beyond "build from scratch":
7
+ - `bug-hunt`: Find and fix 5 planted bugs in a calculator module (debugging)
8
+ - `refactor`: Improve messy code while keeping existing tests passing (refactoring)
9
+ - `concurrent-queue`: Thread-safe priority queue with producer/consumer pattern (concurrency)
10
+ - `api-client`: HTTP client with retry, rate limiting, and circuit breaker (API design)
11
+ - Total built-in tasks: 20 (up from 16)
12
+
3
13
  ## [1.3.0] - 2026-03-05
4
14
 
5
15
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderace
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: Race coding agents against each other on real tasks
5
5
  Project-URL: Homepage, https://github.com/mikiships/coderace
6
6
  Project-URL: Repository, https://github.com/mikiships/coderace
@@ -217,6 +217,10 @@ coderace run --builtin fibonacci
217
217
  | `url-router` | Hard | HTTP-style router with params, wildcard, and 405/404 logic |
218
218
  | `diff-algorithm` | Hard | Unified diff + patch application roundtrip checks |
219
219
  | `task-scheduler` | Hard | Dependency-aware priority scheduler with timeout handling |
220
+ | `bug-hunt` | Hard | Find and fix 5 planted bugs in a calculator module |
221
+ | `refactor` | Hard | Refactor messy code while keeping tests passing |
222
+ | `concurrent-queue` | Hard | Thread-safe priority queue with producer/consumer |
223
+ | `api-client` | Hard | HTTP client with retry, rate limiting, circuit breaker |
220
224
 
221
225
  `coderace tasks list` now includes a `Verify` column so you can see which built-ins ship with verification suites.
222
226
 
@@ -187,6 +187,10 @@ coderace run --builtin fibonacci
187
187
  | `url-router` | Hard | HTTP-style router with params, wildcard, and 405/404 logic |
188
188
  | `diff-algorithm` | Hard | Unified diff + patch application roundtrip checks |
189
189
  | `task-scheduler` | Hard | Dependency-aware priority scheduler with timeout handling |
190
+ | `bug-hunt` | Hard | Find and fix 5 planted bugs in a calculator module |
191
+ | `refactor` | Hard | Refactor messy code while keeping tests passing |
192
+ | `concurrent-queue` | Hard | Thread-safe priority queue with producer/consumer |
193
+ | `api-client` | Hard | HTTP client with retry, rate limiting, circuit breaker |
190
194
 
191
195
  `coderace tasks list` now includes a `Verify` column so you can see which built-ins ship with verification suites.
192
196
 
@@ -0,0 +1,184 @@
1
+ # All-Day Build Contract: New Benchmark Tasks (Batch 2)
2
+
3
+ Status: In Progress
4
+ Date: 2026-03-05
5
+ Owner: Claude Code execution pass
6
+ Scope type: Deliverable-gated (no hour promises)
7
+
8
+ ## 1. Objective
9
+
10
+ Add 4 new built-in benchmark tasks to coderace that test **real-world coding skills** beyond "build from scratch." The current 16 tasks all ask agents to implement something new. These 4 test debugging, refactoring, concurrency, and API design — skills that matter in production but aren't benchmarked.
11
+
12
+ This contract is considered complete only when every deliverable and validation gate below is satisfied.
13
+
14
+ ## 2. Non-Negotiable Build Rules
15
+
16
+ 1. No time-based completion claims.
17
+ 2. Completion is allowed only when all checklist items are checked.
18
+ 3. Full test suite must pass at the end (`pytest tests/ -x`).
19
+ 4. New tasks must follow the exact same format as existing built-in tasks (see `coderace/builtins/tasks/` for examples).
20
+ 5. CLI outputs must be deterministic and schema-backed where specified.
21
+ 6. Never modify files outside the project directory.
22
+ 7. Commit after each completed deliverable (not at the end).
23
+ 8. If stuck on same issue for 3 attempts, stop and write a blocker report.
24
+ 9. Do NOT refactor, restyle, or "improve" existing code or tasks.
25
+ 10. Read existing tasks in `coderace/builtins/tasks/` before writing new ones. Match the exact YAML format, naming conventions, and verification test style.
26
+
27
+ ## 3. Feature Deliverables
28
+
29
+ ### D1. Task: `bug-hunt` (debugging — fix planted bugs in existing code)
30
+
31
+ **Concept:** Provide a working-looking module (`buggy_calculator.py`) with 5 planted bugs. The agent must find and fix them without being told exactly what's wrong.
32
+
33
+ The task description gives the agent:
34
+ - A pre-written `buggy_calculator.py` with an advanced calculator (expressions, variables, functions like `sqrt`, `abs`, `min`, `max`)
35
+ - A failing test file that demonstrates the bugs exist
36
+ - Instructions: "Fix ALL bugs. Do not rewrite from scratch."
37
+
38
+ Planted bugs (specific, not vague):
39
+ 1. Division floors the result to an integer instead of returning a float (`//` used instead of `/`)
40
+ 2. Negative number parsing fails (misses unary minus in expressions)
41
+ 3. Variable assignment overwrites built-in functions
42
+ 4. `min(a, b)` returns max (arguments reversed)
43
+ 5. Parenthesized expressions off-by-one in closing paren detection
44
+
45
+ Hidden verification tests:
46
+ - All 5 bugs must be fixed
47
+ - Original working functionality must still work
48
+ - Code must not be a complete rewrite (verify specific function signatures still exist)
49
+ - Edge cases: nested expressions with negatives, chained function calls
50
+
51
+ Required files:
52
+ - `coderace/builtins/tasks/bug-hunt.yaml`
53
+ - The YAML must include `setup_files:` with the buggy source and failing tests
54
+
55
+ - [ ] Write buggy_calculator.py with exactly 5 planted bugs
56
+ - [ ] Write test_buggy_calculator.py that exposes the bugs
57
+ - [ ] Write verify_bug_hunt.py with hidden tests for all 5 fixes + non-rewrite check
58
+ - [ ] Write bug-hunt.yaml task definition
59
+ - [ ] Tests for D1 (task loads, runs dry-run, YAML validates)
60
+
61
+ ### D2. Task: `refactor` (improve existing messy code while keeping tests passing)
62
+
63
+ **Concept:** Provide a working but messy module (`data_store.py`) — a key-value store with bad naming, duplicated logic, no type hints, mixed concerns, and poor error handling. The agent must refactor it while keeping the existing test suite passing.
64
+
65
+ The task description gives the agent:
66
+ - `data_store.py` — a working but ugly key-value store (~150 lines, deliberately messy)
67
+ - `test_data_store.py` — passing tests (these must STILL pass after refactoring)
68
+ - Instructions: "Refactor for readability, maintainability, and best practices. All existing tests must pass. Do not change test file."
69
+
70
+ Quality dimensions to verify:
71
+ 1. Type hints added (verify with mypy or ast inspection)
72
+ 2. Functions under 25 lines each
73
+ 3. No duplicated logic (measure before/after)
74
+ 4. Consistent naming convention
75
+ 5. Proper error handling (specific exceptions, not bare except)
76
+
77
+ Hidden verification tests:
78
+ - All original test_data_store.py tests still pass
79
+ - AST analysis: type hints present on all public functions
80
+ - AST analysis: no function exceeds 25 lines
81
+ - New edge cases that the refactored code should handle more gracefully
82
+ - Import check: the module still exports the same public API
83
+
84
+ Required files:
85
+ - `coderace/builtins/tasks/refactor.yaml`
86
+
87
+ - [ ] Write messy data_store.py (~150 lines, working but ugly)
88
+ - [ ] Write test_data_store.py with comprehensive passing tests
89
+ - [ ] Write verify_refactor.py with hidden quality checks (AST-based + functional)
90
+ - [ ] Write refactor.yaml task definition
91
+ - [ ] Tests for D2
92
+
93
+ ### D3. Task: `concurrent-queue` (thread-safe producer/consumer with priority)
94
+
95
+ **Concept:** Build a thread-safe priority queue with producer/consumer pattern. Tests concurrency understanding — a known weak spot for LLMs.
96
+
97
+ The task description tells the agent to create `concurrent_queue.py` with:
98
+ - `PriorityTaskQueue` class
99
+ - `submit(task, priority)` — thread-safe task submission
100
+ - `worker(callback)` — starts a worker thread that processes tasks by priority
101
+ - `shutdown(wait=True)` — graceful shutdown
102
+ - Support for: task cancellation, timeout on get, max queue size with backpressure
103
+ - Thread-safe stats: tasks_submitted, tasks_completed, tasks_failed
104
+
105
+ And `test_concurrent_queue.py` with tests for:
106
+ - Basic submit and process
107
+ - Priority ordering (higher priority processed first)
108
+ - Multiple workers
109
+ - Graceful shutdown (pending tasks complete)
110
+ - Task cancellation
111
+ - Backpressure when queue is full
112
+ - Thread safety under concurrent access
113
+
114
+ Hidden verification tests:
115
+ - Stress test: 1000 tasks, 10 workers, verify all complete
116
+ - Priority ordering under load (submit mixed priorities, verify processing order)
117
+ - Shutdown with pending tasks (verify all complete before shutdown returns)
118
+ - Cancellation of in-flight vs queued tasks
119
+ - Deadlock detection (timeout-based, the test itself must complete in <10s)
120
+
121
+ Required files:
122
+ - `coderace/builtins/tasks/concurrent-queue.yaml`
123
+
124
+ - [ ] Write concurrent-queue.yaml with full spec
125
+ - [ ] Write verify_concurrent_queue.py with stress tests and deadlock detection
126
+ - [ ] Tests for D3
127
+
128
+ ### D4. Task: `api-client` (HTTP client with retry, rate limiting, circuit breaker)
129
+
130
+ **Concept:** Build a production-grade HTTP API client with resilience patterns. Tests real-world engineering patterns.
131
+
132
+ The task description tells the agent to create `api_client.py` with:
133
+ - `APIClient(base_url, max_retries=3, rate_limit_per_sec=10)`
134
+ - Methods: `get`, `post`, `put`, `delete` — all async-compatible but sync by default
135
+ - Retry with exponential backoff (jitter) on 429 and 5xx
136
+ - Rate limiting (token bucket)
137
+ - Circuit breaker (open after N consecutive failures, half-open after timeout)
138
+ - Request/response logging
139
+ - Configurable timeout per request
140
+
141
+ And `test_api_client.py` with tests using unittest.mock to simulate server responses.
142
+
143
+ Hidden verification tests:
144
+ - Retry behavior: mock server returns 503 twice then 200, verify 3 attempts made
145
+ - Rate limiting: fire 20 requests, verify spacing respects rate limit
146
+ - Circuit breaker: force 5 failures, verify circuit opens, then half-opens after cooldown
147
+ - Backoff jitter: verify retry delays aren't deterministic (statistical test)
148
+ - Timeout handling: mock slow server, verify timeout raised
149
+
150
+ Required files:
151
+ - `coderace/builtins/tasks/api-client.yaml`
152
+
153
+ - [ ] Write api-client.yaml with full spec
154
+ - [ ] Write verify_api_client.py with resilience pattern tests
155
+ - [ ] Tests for D4
156
+
157
+ ### D5. Integration + Documentation
158
+
159
+ - [ ] All 4 new tasks appear in `coderace tasks list` output
160
+ - [ ] `coderace benchmark --dry-run --tasks bug-hunt,refactor,concurrent-queue,api-client` works
161
+ - [ ] Update CHANGELOG.md with new tasks
162
+ - [ ] Version bump to 1.4.0 in pyproject.toml
163
+ - [ ] README: add "20 built-in tasks" (update from 16) and mention the new task categories
164
+ - [ ] All existing tests still pass (`pytest tests/ -x`)
165
+
166
+ ## 4. Test Requirements
167
+
168
+ - [ ] Unit tests for each new task (YAML loads, validates, dry-run works)
169
+ - [ ] Integration test: `coderace benchmark --dry-run --tasks bug-hunt,refactor,concurrent-queue,api-client` completes
170
+ - [ ] All existing 574 tests must still pass
171
+ - [ ] New tests bring total to 590+
172
+
173
+ ## 5. Reports
174
+
175
+ - Write progress to `progress-log.md` after each deliverable
176
+ - Include: what was built, what tests pass, what's next, any blockers
177
+ - Final summary when all deliverables done or stopped
178
+
179
+ ## 6. Stop Conditions
180
+
181
+ - All deliverables checked and all tests passing -> DONE
182
+ - 3 consecutive failed attempts on same issue -> STOP, write blocker report
183
+ - Scope creep detected (new requirements discovered) -> STOP, report what's new
184
+ - All tests passing but deliverables remain -> continue to next deliverable
@@ -0,0 +1,335 @@
1
+ name: api-client
2
+ difficulty: hard
3
+ description: |
4
+ Build a production-grade HTTP API client with resilience patterns.
5
+
6
+ Create a file called `api_client.py` with the following class:
7
+
8
+ ```python
9
+ class APIClient:
10
+ def __init__(
11
+ self,
12
+ base_url: str,
13
+ max_retries: int = 3,
14
+ rate_limit_per_sec: float = 10,
15
+ timeout: float = 30.0,
16
+ circuit_breaker_threshold: int = 5,
17
+ circuit_breaker_timeout: float = 30.0,
18
+ ):
19
+ """Initialize the API client.
20
+
21
+ Args:
22
+ base_url: Base URL for all requests (e.g., "https://api.example.com")
23
+ max_retries: Max retry attempts on 429/5xx responses
24
+ rate_limit_per_sec: Max requests per second (token bucket)
25
+ timeout: Default timeout per request in seconds
26
+ circuit_breaker_threshold: Consecutive failures before circuit opens
27
+ circuit_breaker_timeout: Seconds before circuit transitions to half-open
28
+ """
29
+
30
+ def get(self, path: str, **kwargs) -> requests.Response:
31
+ """Send a GET request."""
32
+
33
+ def post(self, path: str, **kwargs) -> requests.Response:
34
+ """Send a POST request."""
35
+
36
+ def put(self, path: str, **kwargs) -> requests.Response:
37
+ """Send a PUT request."""
38
+
39
+ def delete(self, path: str, **kwargs) -> requests.Response:
40
+ """Send a DELETE request."""
41
+ ```
42
+
43
+ Requirements:
44
+ - All HTTP methods support `timeout`, `headers`, `params`, `json`, `data` kwargs
45
+ - Retry with exponential backoff + jitter on 429 and 5xx status codes
46
+ - Rate limiting using token bucket algorithm (max N requests per second)
47
+ - Circuit breaker pattern:
48
+ - CLOSED: requests flow normally
49
+ - OPEN: after N consecutive failures, immediately raise `CircuitBreakerOpen`
50
+ - HALF-OPEN: after timeout, allow one test request through
51
+ - Success in HALF-OPEN -> CLOSED; failure -> back to OPEN
52
+ - Custom exceptions: `CircuitBreakerOpen`, `APIClientError`
53
+ - `request_log` attribute: list of dicts with keys `method`, `url`, `status_code`, `timestamp`
54
+ - Uses the `requests` library for HTTP
55
+
56
+ Also create `test_api_client.py` with tests using `unittest.mock` to mock HTTP responses:
57
+ - Basic GET/POST/PUT/DELETE
58
+ - Retry on 503 then success
59
+ - Rate limiting (verify delays between rapid requests)
60
+ - Circuit breaker opens after consecutive failures
61
+ - Circuit breaker transitions from OPEN to HALF-OPEN to CLOSED
62
+ - Timeout handling
63
+ - Request logging
64
+
65
+ Do NOT modify any files other than `api_client.py` and `test_api_client.py`.
66
+ repo: .
67
+ test_command: python3 -m pytest test_api_client.py -x -q
68
+ verify_command: python3 -m pytest verify_api_client.py -x -q
69
+ verify_files:
70
+ verify_api_client.py: |
71
+ """Hidden verification tests for api-client task."""
72
+
73
+ import time
74
+ from unittest.mock import MagicMock, patch, PropertyMock
75
+
76
+ import pytest
77
+
78
+ from api_client import APIClient, CircuitBreakerOpen
79
+
80
+
81
+ def _mock_response(status_code=200, json_data=None):
82
+ resp = MagicMock()
83
+ resp.status_code = status_code
84
+ resp.json.return_value = json_data or {}
85
+ resp.ok = 200 <= status_code < 300
86
+ resp.raise_for_status = MagicMock()
87
+ if status_code >= 400:
88
+ resp.raise_for_status.side_effect = Exception(f"HTTP {status_code}")
89
+ return resp
90
+
91
+
92
+ class TestRetryBehavior:
93
+ @patch("api_client.requests.Session.request")
94
+ def test_retries_on_503_then_succeeds(self, mock_request):
95
+ mock_request.side_effect = [
96
+ _mock_response(503),
97
+ _mock_response(503),
98
+ _mock_response(200, {"ok": True}),
99
+ ]
100
+ client = APIClient(
101
+ "http://test.com",
102
+ max_retries=3,
103
+ rate_limit_per_sec=1000,
104
+ )
105
+ resp = client.get("/endpoint")
106
+ assert resp.status_code == 200
107
+ assert mock_request.call_count == 3
108
+
109
+ @patch("api_client.requests.Session.request")
110
+ def test_retries_on_429(self, mock_request):
111
+ mock_request.side_effect = [
112
+ _mock_response(429),
113
+ _mock_response(200),
114
+ ]
115
+ client = APIClient(
116
+ "http://test.com",
117
+ max_retries=3,
118
+ rate_limit_per_sec=1000,
119
+ )
120
+ resp = client.get("/path")
121
+ assert resp.status_code == 200
122
+ assert mock_request.call_count == 2
123
+
124
+ @patch("api_client.requests.Session.request")
125
+ def test_retries_exhausted_returns_last_response(self, mock_request):
126
+ mock_request.side_effect = [
127
+ _mock_response(503),
128
+ _mock_response(503),
129
+ _mock_response(503),
130
+ ]
131
+ client = APIClient(
132
+ "http://test.com",
133
+ max_retries=3,
134
+ rate_limit_per_sec=1000,
135
+ )
136
+ resp = client.get("/fail")
137
+ assert resp.status_code == 503
138
+
139
+
140
+ class TestBackoffJitter:
141
+ @patch("api_client.requests.Session.request")
142
+ def test_retry_delays_have_jitter(self, mock_request):
143
+ """Retry delays should not be deterministic."""
144
+ mock_request.side_effect = [
145
+ _mock_response(503),
146
+ _mock_response(503),
147
+ _mock_response(200),
148
+ ]
149
+ delays = []
150
+
151
+ original_sleep = time.sleep
152
+ def capture_sleep(duration):
153
+ delays.append(duration)
154
+ # Don't actually sleep in tests
155
+
156
+ client = APIClient(
157
+ "http://test.com",
158
+ max_retries=3,
159
+ rate_limit_per_sec=1000,
160
+ )
161
+ with patch("time.sleep", side_effect=capture_sleep):
162
+ client.get("/test")
163
+
164
+ # Should have at least 1 retry delay
165
+ retry_delays = [d for d in delays if d > 0.001]
166
+ assert len(retry_delays) >= 1
167
+
168
+
169
+ class TestRateLimiting:
170
+ @patch("api_client.requests.Session.request")
171
+ def test_rate_limit_spacing(self, mock_request):
172
+ mock_request.return_value = _mock_response(200)
173
+ # Very low rate limit to make spacing measurable
174
+ client = APIClient(
175
+ "http://test.com",
176
+ max_retries=0,
177
+ rate_limit_per_sec=5,
178
+ )
179
+ start = time.time()
180
+ for _ in range(6):
181
+ client.get("/test")
182
+ elapsed = time.time() - start
183
+ # 6 requests at 5/sec should take at least ~1 second
184
+ assert elapsed >= 0.8
185
+
186
+
187
+ class TestCircuitBreaker:
188
+ @patch("api_client.requests.Session.request")
189
+ def test_circuit_opens_after_consecutive_failures(self, mock_request):
190
+ mock_request.return_value = _mock_response(503)
191
+ client = APIClient(
192
+ "http://test.com",
193
+ max_retries=1,
194
+ rate_limit_per_sec=1000,
195
+ circuit_breaker_threshold=3,
196
+ circuit_breaker_timeout=60,
197
+ )
198
+
199
+ # Cause consecutive failures to open the circuit
200
+ for _ in range(3):
201
+ try:
202
+ client.get("/fail")
203
+ except CircuitBreakerOpen:
204
+ pass
205
+
206
+ # Now circuit should be open
207
+ with pytest.raises(CircuitBreakerOpen):
208
+ client.get("/blocked")
209
+
210
+ @patch("api_client.requests.Session.request")
211
+ def test_circuit_half_open_after_timeout(self, mock_request):
212
+ mock_request.return_value = _mock_response(503)
213
+ client = APIClient(
214
+ "http://test.com",
215
+ max_retries=1,
216
+ rate_limit_per_sec=1000,
217
+ circuit_breaker_threshold=3,
218
+ circuit_breaker_timeout=0.1,
219
+ )
220
+
221
+ # Open the circuit
222
+ for _ in range(5):
223
+ try:
224
+ client.get("/fail")
225
+ except CircuitBreakerOpen:
226
+ pass
227
+
228
+ # Wait for timeout to elapse
229
+ time.sleep(0.15)
230
+
231
+ # Should allow one request through (half-open)
232
+ mock_request.return_value = _mock_response(200)
233
+ resp = client.get("/recover")
234
+ assert resp.status_code == 200
235
+
236
+ @patch("api_client.requests.Session.request")
237
+ def test_circuit_closes_on_half_open_success(self, mock_request):
238
+ mock_request.return_value = _mock_response(503)
239
+ client = APIClient(
240
+ "http://test.com",
241
+ max_retries=1,
242
+ rate_limit_per_sec=1000,
243
+ circuit_breaker_threshold=3,
244
+ circuit_breaker_timeout=0.1,
245
+ )
246
+
247
+ # Open the circuit
248
+ for _ in range(5):
249
+ try:
250
+ client.get("/fail")
251
+ except CircuitBreakerOpen:
252
+ pass
253
+
254
+ time.sleep(0.15)
255
+
256
+ # Half-open success
257
+ mock_request.return_value = _mock_response(200)
258
+ client.get("/recover")
259
+
260
+ # Circuit should be closed now — further requests should work
261
+ resp = client.get("/another")
262
+ assert resp.status_code == 200
263
+
264
+
265
+ class TestHTTPMethods:
266
+ @patch("api_client.requests.Session.request")
267
+ def test_post(self, mock_request):
268
+ mock_request.return_value = _mock_response(201)
269
+ client = APIClient("http://test.com", rate_limit_per_sec=1000)
270
+ resp = client.post("/data", json={"key": "value"})
271
+ assert resp.status_code == 201
272
+
273
+ @patch("api_client.requests.Session.request")
274
+ def test_put(self, mock_request):
275
+ mock_request.return_value = _mock_response(200)
276
+ client = APIClient("http://test.com", rate_limit_per_sec=1000)
277
+ resp = client.put("/data/1", json={"key": "updated"})
278
+ assert resp.status_code == 200
279
+
280
+ @patch("api_client.requests.Session.request")
281
+ def test_delete(self, mock_request):
282
+ mock_request.return_value = _mock_response(204)
283
+ client = APIClient("http://test.com", rate_limit_per_sec=1000)
284
+ resp = client.delete("/data/1")
285
+ assert resp.status_code == 204
286
+
287
+
288
+ class TestRequestLogging:
289
+ @patch("api_client.requests.Session.request")
290
+ def test_requests_are_logged(self, mock_request):
291
+ mock_request.return_value = _mock_response(200)
292
+ client = APIClient("http://test.com", rate_limit_per_sec=1000)
293
+ client.get("/a")
294
+ client.post("/b")
295
+
296
+ log = client.request_log
297
+ assert len(log) >= 2
298
+ assert log[0]["method"].upper() == "GET"
299
+ assert "/a" in log[0]["url"]
300
+ assert log[0]["status_code"] == 200
301
+ assert "timestamp" in log[0]
302
+
303
+
304
+ class TestTimeout:
305
+ @patch("api_client.requests.Session.request")
306
+ def test_timeout_passed_to_request(self, mock_request):
307
+ mock_request.return_value = _mock_response(200)
308
+ client = APIClient(
309
+ "http://test.com",
310
+ rate_limit_per_sec=1000,
311
+ timeout=5.0,
312
+ )
313
+ client.get("/test")
314
+ call_kwargs = mock_request.call_args
315
+ # Timeout should be passed to the underlying request
316
+ assert call_kwargs.kwargs.get("timeout") == 5.0 or (
317
+ len(call_kwargs.args) > 0 and
318
+ any(isinstance(a, (int, float)) and a == 5.0
319
+ for a in call_kwargs.args)
320
+ ) or "timeout" in str(call_kwargs)
321
+ lint_command: ruff check api_client.py
322
+ timeout: 600
323
+ agents:
324
+ - claude
325
+ - codex
326
+ - gemini
327
+ - aider
328
+ - opencode
329
+ scoring:
330
+ tests: 25
331
+ verify: 30
332
+ exit: 20
333
+ lint: 15
334
+ time: 5
335
+ lines: 5