coderace 1.9.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. coderace-2.0.0/BUILD-REPORT-trend.md +68 -0
  2. coderace-2.0.0/BUILD-REPORT.md +41 -0
  3. {coderace-1.9.0 → coderace-2.0.0}/CHANGELOG.md +15 -0
  4. {coderace-1.9.0 → coderace-2.0.0}/PKG-INFO +1 -1
  5. {coderace-1.9.0 → coderace-2.0.0}/coderace/__init__.py +1 -1
  6. {coderace-1.9.0 → coderace-2.0.0}/coderace/cli.py +77 -0
  7. coderace-2.0.0/coderace/commands/trend.py +330 -0
  8. {coderace-1.9.0 → coderace-2.0.0}/pyproject.toml +1 -1
  9. coderace-2.0.0/tests/commands/test_trend.py +409 -0
  10. coderace-1.9.0/BUILD-REPORT.md +0 -22
  11. {coderace-1.9.0 → coderace-2.0.0}/.claude-task.md +0 -0
  12. {coderace-1.9.0 → coderace-2.0.0}/.github/workflows/examples/coderace-pr-review.yml +0 -0
  13. {coderace-1.9.0 → coderace-2.0.0}/.github/workflows/examples/coderace-quality-gate.yml +0 -0
  14. {coderace-1.9.0 → coderace-2.0.0}/.github/workflows/publish.yml +0 -0
  15. {coderace-1.9.0 → coderace-2.0.0}/.gitignore +0 -0
  16. {coderace-1.9.0 → coderace-2.0.0}/DONE.txt +0 -0
  17. {coderace-1.9.0 → coderace-2.0.0}/LICENSE +0 -0
  18. {coderace-1.9.0 → coderace-2.0.0}/README.md +0 -0
  19. {coderace-1.9.0 → coderace-2.0.0}/action.yml +0 -0
  20. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-benchmark-tasks-v2.md +0 -0
  21. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-benchmark.md +0 -0
  22. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-builtin-tasks.md +0 -0
  23. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-ci-integration.md +0 -0
  24. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-context-eval.md +0 -0
  25. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-cost-tracking.md +0 -0
  26. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-dashboard.md +0 -0
  27. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-leaderboard.md +0 -0
  28. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-model-selection.md +0 -0
  29. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-race-mode.md +0 -0
  30. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-review-mode.md +0 -0
  31. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-v0.2.md +0 -0
  32. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-v090-tasks.md +0 -0
  33. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-v1.0-statistical.md +0 -0
  34. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-v1.6.0-github-action-review.md +0 -0
  35. {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-verification-tests.md +0 -0
  36. {coderace-1.9.0 → coderace-2.0.0}/benchmark-results/fibonacci-2026-02-27.md +0 -0
  37. {coderace-1.9.0 → coderace-2.0.0}/benchmark-results/fibonacci-v2-2026-02-27.md +0 -0
  38. {coderace-1.9.0 → coderace-2.0.0}/benchmark-results/hard-tasks-2026-02-27.md +0 -0
  39. {coderace-1.9.0 → coderace-2.0.0}/benchmark-results/multi-task-2026-02-27.md +0 -0
  40. {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/__init__.py +0 -0
  41. {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/aider.py +0 -0
  42. {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/base.py +0 -0
  43. {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/claude.py +0 -0
  44. {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/codex.py +0 -0
  45. {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/gemini.py +0 -0
  46. {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/opencode.py +0 -0
  47. {coderace-1.9.0 → coderace-2.0.0}/coderace/benchmark.py +0 -0
  48. {coderace-1.9.0 → coderace-2.0.0}/coderace/benchmark_report.py +0 -0
  49. {coderace-1.9.0 → coderace-2.0.0}/coderace/benchmark_stats.py +0 -0
  50. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/__init__.py +0 -0
  51. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/api-client.yaml +0 -0
  52. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/binary-search-tree.yaml +0 -0
  53. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/bug-hunt.yaml +0 -0
  54. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/cli-args-parser.yaml +0 -0
  55. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/concurrent-queue.yaml +0 -0
  56. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/csv-analyzer.yaml +0 -0
  57. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/data-pipeline.yaml +0 -0
  58. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/diff-algorithm.yaml +0 -0
  59. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/expression-evaluator.yaml +0 -0
  60. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/fibonacci.yaml +0 -0
  61. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/file-watcher.yaml +0 -0
  62. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/http-server.yaml +0 -0
  63. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/json-parser.yaml +0 -0
  64. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/lru-cache.yaml +0 -0
  65. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/markdown-to-html.yaml +0 -0
  66. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/refactor.yaml +0 -0
  67. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/regex-engine.yaml +0 -0
  68. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/state-machine.yaml +0 -0
  69. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/task-scheduler.yaml +0 -0
  70. {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/url-router.yaml +0 -0
  71. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/__init__.py +0 -0
  72. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/benchmark.py +0 -0
  73. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/context_eval.py +0 -0
  74. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/dashboard.py +0 -0
  75. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/diff.py +0 -0
  76. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/gate.py +0 -0
  77. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/history.py +0 -0
  78. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/leaderboard.py +0 -0
  79. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/race.py +0 -0
  80. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/results.py +0 -0
  81. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/review.py +0 -0
  82. {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/tasks.py +0 -0
  83. {coderace-1.9.0 → coderace-2.0.0}/coderace/context_eval.py +0 -0
  84. {coderace-1.9.0 → coderace-2.0.0}/coderace/context_eval_report.py +0 -0
  85. {coderace-1.9.0 → coderace-2.0.0}/coderace/cost.py +0 -0
  86. {coderace-1.9.0 → coderace-2.0.0}/coderace/dashboard.py +0 -0
  87. {coderace-1.9.0 → coderace-2.0.0}/coderace/display.py +0 -0
  88. {coderace-1.9.0 → coderace-2.0.0}/coderace/elo.py +0 -0
  89. {coderace-1.9.0 → coderace-2.0.0}/coderace/export.py +0 -0
  90. {coderace-1.9.0 → coderace-2.0.0}/coderace/git_ops.py +0 -0
  91. {coderace-1.9.0 → coderace-2.0.0}/coderace/html_report.py +0 -0
  92. {coderace-1.9.0 → coderace-2.0.0}/coderace/maintainer_rubric.py +0 -0
  93. {coderace-1.9.0 → coderace-2.0.0}/coderace/publish.py +0 -0
  94. {coderace-1.9.0 → coderace-2.0.0}/coderace/reporter.py +0 -0
  95. {coderace-1.9.0 → coderace-2.0.0}/coderace/review.py +0 -0
  96. {coderace-1.9.0 → coderace-2.0.0}/coderace/review_report.py +0 -0
  97. {coderace-1.9.0 → coderace-2.0.0}/coderace/scorer.py +0 -0
  98. {coderace-1.9.0 → coderace-2.0.0}/coderace/statistics.py +0 -0
  99. {coderace-1.9.0 → coderace-2.0.0}/coderace/stats.py +0 -0
  100. {coderace-1.9.0 → coderace-2.0.0}/coderace/store.py +0 -0
  101. {coderace-1.9.0 → coderace-2.0.0}/coderace/task.py +0 -0
  102. {coderace-1.9.0 → coderace-2.0.0}/coderace/types.py +0 -0
  103. {coderace-1.9.0 → coderace-2.0.0}/demo-race.yaml +0 -0
  104. {coderace-1.9.0 → coderace-2.0.0}/examples/add-type-hints.yaml +0 -0
  105. {coderace-1.9.0 → coderace-2.0.0}/examples/ci-race-on-pr.yml +0 -0
  106. {coderace-1.9.0 → coderace-2.0.0}/examples/context-eval-demo.sh +0 -0
  107. {coderace-1.9.0 → coderace-2.0.0}/examples/example-task.yaml +0 -0
  108. {coderace-1.9.0 → coderace-2.0.0}/examples/fix-edge-case.yaml +0 -0
  109. {coderace-1.9.0 → coderace-2.0.0}/examples/model-selection.yaml +0 -0
  110. {coderace-1.9.0 → coderace-2.0.0}/examples/write-tests.yaml +0 -0
  111. {coderace-1.9.0 → coderace-2.0.0}/progress-log.md +0 -0
  112. {coderace-1.9.0 → coderace-2.0.0}/scripts/ci-gate.sh +0 -0
  113. {coderace-1.9.0 → coderace-2.0.0}/scripts/ci-review.sh +0 -0
  114. {coderace-1.9.0 → coderace-2.0.0}/scripts/ci-run.sh +0 -0
  115. {coderace-1.9.0 → coderace-2.0.0}/scripts/format-comment.py +0 -0
  116. {coderace-1.9.0 → coderace-2.0.0}/scripts/format-review-comment.py +0 -0
  117. {coderace-1.9.0 → coderace-2.0.0}/tasks/markdown-table.yaml +0 -0
  118. {coderace-1.9.0 → coderace-2.0.0}/tasks/parse-duration.yaml +0 -0
  119. {coderace-1.9.0 → coderace-2.0.0}/tests/__init__.py +0 -0
  120. {coderace-1.9.0 → coderace-2.0.0}/tests/conftest.py +0 -0
  121. {coderace-1.9.0 → coderace-2.0.0}/tests/fixtures/sample.patch +0 -0
  122. {coderace-1.9.0 → coderace-2.0.0}/tests/test_adapters.py +0 -0
  123. {coderace-1.9.0 → coderace-2.0.0}/tests/test_benchmark.py +0 -0
  124. {coderace-1.9.0 → coderace-2.0.0}/tests/test_benchmark_tasks_v2.py +0 -0
  125. {coderace-1.9.0 → coderace-2.0.0}/tests/test_benchmark_trials.py +0 -0
  126. {coderace-1.9.0 → coderace-2.0.0}/tests/test_benchmark_v1_integration.py +0 -0
  127. {coderace-1.9.0 → coderace-2.0.0}/tests/test_builtins.py +0 -0
  128. {coderace-1.9.0 → coderace-2.0.0}/tests/test_ci_gate.py +0 -0
  129. {coderace-1.9.0 → coderace-2.0.0}/tests/test_cli.py +0 -0
  130. {coderace-1.9.0 → coderace-2.0.0}/tests/test_cli_store_integration.py +0 -0
  131. {coderace-1.9.0 → coderace-2.0.0}/tests/test_context_eval.py +0 -0
  132. {coderace-1.9.0 → coderace-2.0.0}/tests/test_context_eval_dashboard.py +0 -0
  133. {coderace-1.9.0 → coderace-2.0.0}/tests/test_cost.py +0 -0
  134. {coderace-1.9.0 → coderace-2.0.0}/tests/test_cost_config.py +0 -0
  135. {coderace-1.9.0 → coderace-2.0.0}/tests/test_cost_integration.py +0 -0
  136. {coderace-1.9.0 → coderace-2.0.0}/tests/test_dashboard.py +0 -0
  137. {coderace-1.9.0 → coderace-2.0.0}/tests/test_dashboard_cli.py +0 -0
  138. {coderace-1.9.0 → coderace-2.0.0}/tests/test_diff.py +0 -0
  139. {coderace-1.9.0 → coderace-2.0.0}/tests/test_elo.py +0 -0
  140. {coderace-1.9.0 → coderace-2.0.0}/tests/test_examples.py +0 -0
  141. {coderace-1.9.0 → coderace-2.0.0}/tests/test_export.py +0 -0
  142. {coderace-1.9.0 → coderace-2.0.0}/tests/test_format_comment.py +0 -0
  143. {coderace-1.9.0 → coderace-2.0.0}/tests/test_full_workflow.py +0 -0
  144. {coderace-1.9.0 → coderace-2.0.0}/tests/test_git_ops.py +0 -0
  145. {coderace-1.9.0 → coderace-2.0.0}/tests/test_github_action_review.py +0 -0
  146. {coderace-1.9.0 → coderace-2.0.0}/tests/test_history.py +0 -0
  147. {coderace-1.9.0 → coderace-2.0.0}/tests/test_html_report.py +0 -0
  148. {coderace-1.9.0 → coderace-2.0.0}/tests/test_leaderboard.py +0 -0
  149. {coderace-1.9.0 → coderace-2.0.0}/tests/test_maintainer_rubric.py +0 -0
  150. {coderace-1.9.0 → coderace-2.0.0}/tests/test_markdown_results.py +0 -0
  151. {coderace-1.9.0 → coderace-2.0.0}/tests/test_model_selection_d1_d2.py +0 -0
  152. {coderace-1.9.0 → coderace-2.0.0}/tests/test_model_selection_d3.py +0 -0
  153. {coderace-1.9.0 → coderace-2.0.0}/tests/test_model_selection_d4.py +0 -0
  154. {coderace-1.9.0 → coderace-2.0.0}/tests/test_publish.py +0 -0
  155. {coderace-1.9.0 → coderace-2.0.0}/tests/test_race.py +0 -0
  156. {coderace-1.9.0 → coderace-2.0.0}/tests/test_reporter.py +0 -0
  157. {coderace-1.9.0 → coderace-2.0.0}/tests/test_review.py +0 -0
  158. {coderace-1.9.0 → coderace-2.0.0}/tests/test_scorer.py +0 -0
  159. {coderace-1.9.0 → coderace-2.0.0}/tests/test_statistics.py +0 -0
  160. {coderace-1.9.0 → coderace-2.0.0}/tests/test_stats.py +0 -0
  161. {coderace-1.9.0 → coderace-2.0.0}/tests/test_store.py +0 -0
  162. {coderace-1.9.0 → coderace-2.0.0}/tests/test_task.py +0 -0
  163. {coderace-1.9.0 → coderace-2.0.0}/tests/test_tasks_cli.py +0 -0
  164. {coderace-1.9.0 → coderace-2.0.0}/tests/test_verification_integration.py +0 -0
  165. {coderace-1.9.0 → coderace-2.0.0}/uv.lock +0 -0
@@ -0,0 +1,68 @@
1
+ # Build Report: coderace v2.0.0 trend command
2
+
3
+ **Date:** 2026-03-12
4
+ **Contract:** `/Users/mordecai/.openclaw/workspace/memory/contracts/coderace-v2.0.0-trend.md`
5
+ **Status:** SUCCESS
6
+ **Commit:** `7a33d88` — "feat: add trend command, bump to v2.0.0"
7
+
8
+ ---
9
+
10
+ ## Deliverables Completed
11
+
12
+ - [x] `coderace/commands/trend.py` — full implementation
13
+ - [x] `coderace trend` registered in `coderace/cli.py`
14
+ - [x] `tests/commands/test_trend.py` — 32 new tests
15
+ - [x] `pyproject.toml` version bumped: 1.9.0 → 2.0.0
16
+ - [x] `coderace/__init__.py` version bumped: 1.9.0 → 2.0.0
17
+ - [x] `CHANGELOG.md` entry written
18
+ - [x] `coderace trend --help` confirmed working
19
+
20
+ ---
21
+
22
+ ## Test Results
23
+
24
+ | Suite | Tests | Passed | Failed |
25
+ |-------|------:|-------:|-------:|
26
+ | `tests/commands/test_trend.py` (new) | 32 | 32 | 0 |
27
+ | Full test suite (`tests/`) | 761 | 761 | 0 |
28
+
29
+ ---
30
+
31
+ ## Command Interface
32
+
33
+ ```
34
+ Usage: coderace trend [OPTIONS]
35
+
36
+ Visualize agent score progression over time.
37
+
38
+ Options:
39
+ --agent TEXT Filter by agent name (also enables detailed per-task view)
40
+ --task TEXT Filter by task name
41
+ --days INTEGER Look back this many days (default: 30)
42
+ --format TEXT Output format: terminal (default) | markdown | json
43
+ --help Show this message and exit.
44
+ ```
45
+
46
+ ---
47
+
48
+ ## Implementation Notes
49
+
50
+ - `coderace/commands/trend.py` contains all business logic (TrendPoint, AgentTaskTrend, sparkline, format functions) — CLI command in cli.py is thin glue only, matching the `history` command pattern
51
+ - Sparkline uses Unicode block chars `▁▂▃▄▅▆▇█` with ASCII fallback `_.-*^`
52
+ - Date filtering applied post-query (get_runs doesn't support `since` param directly)
53
+ - `--format json` returns structured `{ trends: [{ agent, task, runs, summary }] }` with improvement_rate as `trend_pct`
54
+
55
+ ---
56
+
57
+ ## Issues Encountered
58
+
59
+ 1. **`CliRunner(mix_stderr=False)` unsupported** in the installed version of typer. Fixed: removed the argument.
60
+ 2. **`coderace` on PATH is brew-installed v1.3.0** at `/opt/homebrew/bin/coderace`. The pipx version at `~/.local/bin/coderace` is v2.0.0 and has the `trend` command. Confirmed working: `~/.local/bin/coderace trend --help`. The PATH ordering issue is outside build scope.
61
+
62
+ ---
63
+
64
+ ## Stop Conditions Honored
65
+
66
+ - Did NOT push to PyPI
67
+ - Did NOT git push
68
+ - Did NOT tag release
@@ -0,0 +1,41 @@
1
+ # coderace v1.9.0 Build Report
2
+
3
+ **Built:** 2026-03-12
4
+ **Version:** 1.9.0
5
+ **Commit:** 19d65382e908d292b0e42003f98cdf7832fea49a
6
+ **Tests:** 729 passing (700 baseline + 29 new)
7
+ **PyPI:** https://pypi.org/project/coderace/1.9.0/
8
+
9
+ ---
10
+
11
+ ## What was built
12
+
13
+ **CI Quality Gate** — makes the maintainer rubric enforceable in CI.
14
+
15
+ METR published research (Mar 2026) showing ~50% of SWE-bench-passing PRs would be rejected by real maintainers. coderace v1.8.0 shipped the rubric as a diagnostic. v1.9.0 makes it a gate.
16
+
17
+ ### D1: `--min-score` on `coderace review --maintainer-mode`
18
+ - New `--min-score N` flag (0-100 int)
19
+ - Exits 1 when composite rubric score < N
20
+ - Prints `✅ Maintainer score 87 ≥ 80 (gate: PASS)` or `❌ Maintainer score 54 < 80 (gate: FAIL)`
21
+ - Without `--min-score`: existing behavior unchanged
22
+
23
+ ### D2: `coderace gate` standalone command
24
+ - `coderace gate --diff <file|-> --min-score 80`
25
+ - Accepts diff via file path or stdin (`--diff -`)
26
+ - Exits 0 (pass) or 1 (fail)
27
+ - `--json` flag for CI log parsing (score, gate, dimensions)
28
+ - Pure static analysis — no LLM, no API keys required
29
+
30
+ ### D3: GitHub Action update
31
+ - New `action.yml` input: `maintainer-min-score` (default: empty = no gate, backward compatible)
32
+ - New `scripts/ci-gate.sh` CI script handling all diff sources
33
+ - Example workflow: `.github/workflows/examples/coderace-quality-gate.yml`
34
+
35
+ ### D4: Tests (29 new)
36
+ - `tests/test_ci_gate.py` — 29 tests covering gate pass/fail, threshold edge cases, empty diff, JSON output, error handling, --min-score on review, action.yml structure
37
+
38
+ ### D5: Docs
39
+ - README: "CI Quality Gate" section with one-liner examples and GitHub Action snippet
40
+ - CHANGELOG: v1.9.0 entry
41
+ - Version bumped: 1.8.0 → 1.9.0
@@ -1,5 +1,20 @@
1
1
  # Changelog
2
2
 
3
+ ## [2.0.0] - 2026-03-12
4
+
5
+ ### Added
6
+ - `coderace trend` command: visualize agent score progression over time
7
+ - Summary table: Agent | Task | Runs | Avg Score | Best Score | Latest Score | Trend
8
+ - Unicode sparkline in Trend column (`▁▂▃▄▅▆▇█`) with delta arrow (`↑ +8.3` / `↓ -2.1` / `→ 0.0`)
9
+ - Color-coded output: green (improving), red (regressing), white (flat)
10
+ - `--agent`: filter to one agent and show detailed per-task score history
11
+ - `--task`: filter to one task
12
+ - `--days N`: look back N days (default: 30)
13
+ - `--format terminal|markdown|json`: multiple output formats
14
+ - `--format json` returns structured `{ agent, task, runs, summary }` suitable for CI
15
+ - Graceful empty-DB handling (no error, helpful message)
16
+ - 32 new tests covering unit logic, edge cases, and CLI integration
17
+
3
18
  ## [1.9.0] - 2026-03-12
4
19
 
5
20
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: coderace
3
- Version: 1.9.0
3
+ Version: 2.0.0
4
4
  Summary: Race coding agents against each other on real tasks
5
5
  Project-URL: Homepage, https://github.com/mikiships/coderace
6
6
  Project-URL: Repository, https://github.com/mikiships/coderace
@@ -1,3 +1,3 @@
1
1
  """coderace - Race coding agents against each other on real tasks."""
2
2
 
3
- __version__ = "1.9.0"
3
+ __version__ = "2.0.0"
@@ -1108,6 +1108,83 @@ def dashboard(
1108
1108
  console.print(f"[dim]Opened in browser[/dim]")
1109
1109
 
1110
1110
 
1111
@app.command()
def trend(
    agent: str | None = typer.Option(
        None, "--agent", help="Filter by agent name (also enables detailed per-task view)"
    ),
    task: str | None = typer.Option(
        None, "--task", help="Filter by task name"
    ),
    days: int = typer.Option(
        30, "--days", help="Look back this many days (default: 30)"
    ),
    fmt: str | None = typer.Option(
        None,
        "--format",
        "-F",
        help="Output format: terminal (default) | markdown | json",
    ),
) -> None:
    """Visualize agent score progression over time.

    Thin CLI glue: fetches runs from the result store, applies the date
    window, and delegates rendering to coderace.commands.trend.
    """
    import sys

    from coderace.commands.trend import (
        _build_trends,
        format_trend_json,
        format_trend_markdown,
        format_trend_terminal,
    )
    from coderace.store import ResultStore, _parse_since

    try:
        store = ResultStore()
    except Exception as exc:
        console.print(f"[red]Cannot open result store: {exc}[/red]")
        raise typer.Exit(1)

    try:
        # get_runs has no `since` parameter, so fetch generously and apply
        # the date window in Python afterwards.
        runs = store.get_runs(
            task_name=task,
            agent=agent,
            limit=10000,
        )
        if days:
            cutoff = _parse_since(f"{days}d")
            if cutoff:
                runs = [r for r in runs if r.timestamp >= cutoff]
    finally:
        store.close()

    trends = _build_trends(runs, agent_filter=agent, task_filter=task)

    if not trends:
        console.print("[yellow]No trend data found. Run some races first.[/yellow]")
        return

    if fmt == "markdown":
        sys.stdout.write(format_trend_markdown(trends, detail_agent=agent))
    elif fmt == "json":
        sys.stdout.write(format_trend_json(trends))
    elif fmt is not None and fmt != "terminal":
        console.print(
            f"[red]Unknown --format {fmt!r}. Choose: terminal, markdown, json[/red]"
        )
        raise typer.Exit(1)
    else:
        format_trend_terminal(trends, detail_agent=agent, console=console)
1186
+
1187
+
1111
1188
  @app.command()
1112
1189
  def version() -> None:
1113
1190
  """Show coderace version."""
@@ -0,0 +1,330 @@
1
+ """Trend command — visualize agent score progression over time."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime, timedelta, timezone
9
+ from typing import Optional
10
+
11
+ from rich.console import Console
12
+ from rich.table import Table
13
+ from rich.text import Text
14
+
15
+ from coderace.store import RunRecord, AgentRecord
16
+
17
+ # Unicode sparkline characters (low → high)
18
+ _SPARK_CHARS = "▁▂▃▄▅▆▇█"
19
+ _SPARK_ASCII = "_.-*^"
20
+
21
+
22
@dataclass
class TrendPoint:
    """One observation in an (agent, task) score series.

    Points are produced in chronological order by ``_build_trends``.
    """

    run_id: int             # result-store row id of the race run
    timestamp: str          # ISO-8601 timestamp of the run
    score: float            # composite score the agent earned
    delta: Optional[float]  # None for first point
    is_winner: bool         # True when the agent won this run
31
+
32
+
33
@dataclass
class AgentTaskTrend:
    """Chronological score history for a single (agent, task) pair."""

    agent: str
    task: str
    points: list[TrendPoint] = field(default_factory=list)

    @property
    def runs(self) -> int:
        """Number of recorded runs."""
        return len(self.points)

    @property
    def avg_score(self) -> float:
        """Mean score across all runs; 0.0 when there are none."""
        if not self.points:
            return 0.0
        return sum(p.score for p in self.points) / len(self.points)

    @property
    def best_score(self) -> float:
        """Highest score observed; 0.0 when there are no runs."""
        return max((p.score for p in self.points), default=0.0)

    @property
    def latest_score(self) -> float:
        """Score of the most recent run; 0.0 when there are no runs."""
        return self.points[-1].score if self.points else 0.0

    @property
    def latest_delta(self) -> Optional[float]:
        """Most recent run-to-run change, or None with fewer than two runs."""
        return self.points[-1].delta if len(self.points) >= 2 else None

    @property
    def improvement_rate(self) -> Optional[float]:
        """Pct of runs where score improved vs previous run.

        Expressed as a fraction in [0, 1]; None with fewer than two runs.
        """
        if len(self.points) < 2:
            return None
        ups = sum(
            1 for p in self.points[1:] if p.delta is not None and p.delta > 0
        )
        return ups / (len(self.points) - 1)

    def sparkline(self, use_unicode: bool = True) -> str:
        """Generate a sparkline string for the score series.

        Returns an em-dash for series with fewer than two points.
        """
        values = [p.score for p in self.points]
        if len(values) < 2:
            return "—"

        palette = _SPARK_CHARS if use_unicode else _SPARK_ASCII
        top = len(palette) - 1
        low, high = min(values), max(values)
        if high == low:
            # Flat series: every point maps to the middle glyph.
            return palette[len(palette) // 2] * len(values)
        # Scale each score into the palette range (round() matches the
        # original bucketing exactly).
        return "".join(
            palette[round((v - low) / (high - low) * top)] for v in values
        )
100
+
101
+
102
+ def _trend_direction(delta: Optional[float]) -> tuple[str, str]:
103
+ """Return (symbol, rich_style) for a delta value."""
104
+ if delta is None:
105
+ return "—", "dim"
106
+ if delta > 0.05:
107
+ return f"↑ +{delta:.1f}", "green"
108
+ if delta < -0.05:
109
+ return f"↓ {delta:.1f}", "red"
110
+ return f"→ {delta:.1f}", "white"
111
+
112
+
113
def _build_trends(
    runs: list[RunRecord],
    agent_filter: Optional[str],
    task_filter: Optional[str],
) -> list[AgentTaskTrend]:
    """Group run records into per-(agent, task) trend series.

    Entries are ordered oldest-first by timestamp so each point's delta is
    the change relative to the immediately preceding run.
    """
    # (agent, task) -> raw entries (timestamp, run_id, score, won)
    buckets: dict[tuple[str, str], list[tuple[str, int, float, bool]]] = {}

    for record in runs:
        if task_filter and record.task_name != task_filter:
            continue
        for agent_result in record.agents:
            if agent_filter and agent_result.agent != agent_filter:
                continue
            bucket = buckets.setdefault(
                (agent_result.agent, record.task_name), []
            )
            bucket.append(
                (
                    record.timestamp,
                    record.run_id,
                    agent_result.composite_score,
                    agent_result.is_winner,
                )
            )

    results: list[AgentTaskTrend] = []
    for (agent_name, task_name), entries in sorted(buckets.items()):
        # Oldest first so the deltas read forward in time.
        entries.sort(key=lambda e: e[0])
        series = AgentTaskTrend(agent=agent_name, task=task_name)
        previous: Optional[float] = None
        for stamp, run_id, score, won in entries:
            series.points.append(
                TrendPoint(
                    run_id=run_id,
                    timestamp=stamp,
                    score=score,
                    delta=None if previous is None else score - previous,
                    is_winner=won,
                )
            )
            previous = score
        results.append(series)

    return results
154
+
155
+
156
def format_trend_terminal(
    trends: list[AgentTaskTrend],
    detail_agent: Optional[str] = None,
    console: Optional[Console] = None,
) -> str:
    """Render trend data as a Rich terminal table.

    Prints the table to *console* (a fresh ``Console`` when omitted) and
    returns a plain-text rendering of the same table. With *detail_agent*
    set, shows one row per run plus summary stats; otherwise one summary
    row per (agent, task) pair. Prints an advisory and returns "" when
    *trends* is empty.
    """
    console = console or Console()

    if not trends:
        console.print("[yellow]No trend data found.[/yellow]")
        return ""

    # The block glyphs need a Unicode-capable stream. BUGFIX: the previous
    # check also matched "us-ascii" (which cannot encode ▁…█), and crashed
    # with AttributeError when sys.stdout.encoding was None.
    encoding = getattr(sys.stdout, "encoding", None) or ""
    use_unicode = encoding.lower().startswith("utf")

    if detail_agent:
        table = _build_detail_table(trends, detail_agent)
        console.print(table)
        _print_detail_summary(trends, detail_agent, console)
    else:
        table = _build_summary_table(trends, use_unicode)
        console.print(table)

    # Re-render into a plain string so callers always get text back
    # (previously the detail path could implicitly return None).
    text_console = Console(file=None, force_terminal=False, width=120)
    with text_console.capture() as capture:
        text_console.print(table)
    return capture.get()


def _format_point_timestamp(ts: str) -> str:
    """Shorten an ISO-8601 timestamp to 'YYYY-MM-DD HH:MM:SS' for display."""
    if "T" in ts:
        date_part, _, time_part = ts.partition("T")
        return date_part + " " + time_part[:8]
    return ts


def _build_detail_table(trends: list[AgentTaskTrend], detail_agent: str) -> Table:
    """Build the per-run table for a single agent across its tasks."""
    table = Table(title=f"coderace trend — {detail_agent}", show_lines=True)
    table.add_column("Run ID", justify="center", style="bold")
    table.add_column("Date", style="dim")
    table.add_column("Task", style="cyan")
    table.add_column("Score", justify="right")
    table.add_column("Delta", justify="right")
    table.add_column("Result")

    for trend in trends:
        for p in trend.points:
            sym, sty = _trend_direction(p.delta)
            table.add_row(
                str(p.run_id),
                _format_point_timestamp(p.timestamp),
                trend.task,
                f"{p.score:.1f}",
                Text(sym, style=sty),
                Text(
                    "win" if p.is_winner else "loss",
                    style="green" if p.is_winner else "dim",
                ),
            )
    return table


def _print_detail_summary(
    trends: list[AgentTaskTrend],
    detail_agent: str,
    console: Console,
) -> None:
    """Print aggregate stats below the detail table."""
    total_runs = sum(t.runs for t in trends)
    all_scores = [p.score for t in trends for p in t.points]
    avg = sum(all_scores) / len(all_scores) if all_scores else 0.0
    best = max(all_scores) if all_scores else 0.0
    rates = [t.improvement_rate for t in trends if t.improvement_rate is not None]
    avg_impr = sum(rates) / len(rates) if rates else None

    console.print(f"\n[bold]Summary for {detail_agent}[/bold]")
    console.print(f" Total runs: {total_runs}")
    console.print(f" Avg score: {avg:.1f}")
    console.print(f" Best score: {best:.1f}")
    if avg_impr is not None:
        console.print(f" Improvement rate: {avg_impr:.0%}")
    else:
        console.print(" Improvement rate: — (need 2+ runs per task)")


def _build_summary_table(trends: list[AgentTaskTrend], use_unicode: bool) -> Table:
    """Build the one-row-per-(agent, task) summary table with sparklines."""
    table = Table(title="coderace trend", show_lines=True)
    table.add_column("Agent", style="cyan")
    table.add_column("Task")
    table.add_column("Runs", justify="right")
    table.add_column("Avg Score", justify="right")
    table.add_column("Best Score", justify="right")
    table.add_column("Latest Score", justify="right")
    table.add_column("Trend")

    for trend in trends:
        spark = trend.sparkline(use_unicode=use_unicode)
        sym, _ = _trend_direction(trend.latest_delta)
        delta = trend.latest_delta

        # Color by direction; a single run has no trend to show.
        if trend.runs < 2:
            trend_cell: str | Text = Text("—", style="dim")
        elif delta is not None and delta > 0.05:
            trend_cell = Text(f"{spark} {sym}", style="green")
        elif delta is not None and delta < -0.05:
            trend_cell = Text(f"{spark} {sym}", style="red")
        else:
            trend_cell = Text(f"{spark} {sym}", style="white")

        table.add_row(
            trend.agent,
            trend.task,
            str(trend.runs),
            f"{trend.avg_score:.1f}",
            f"{trend.best_score:.1f}",
            f"{trend.latest_score:.1f}",
            trend_cell,
        )
    return table
257
+
258
+
259
def format_trend_markdown(
    trends: list[AgentTaskTrend],
    detail_agent: Optional[str] = None,
) -> str:
    """Render trend data as a markdown table.

    With *detail_agent* set, emits one row per run; otherwise one summary
    row per (agent, task) pair. Returns a placeholder message when empty.
    """
    if not trends:
        return "## coderace trend\n\n_No trend data found._\n"

    parts = ["## coderace trend\n\n"]

    if detail_agent:
        parts.append("| Run ID | Date | Task | Score | Delta | Result |\n")
        parts.append("|--------|------|------|------:|------:|--------|\n")
        body: list[str] = []
        for trend in trends:
            for p in trend.points:
                stamp = p.timestamp
                if "T" in stamp:
                    date_part, _, time_part = stamp.partition("T")
                    stamp = date_part + " " + time_part[:8]
                if p.delta is None:
                    delta_cell = "—"
                elif p.delta > 0:
                    delta_cell = f"+{p.delta:.1f}"
                else:
                    delta_cell = f"{p.delta:.1f}"
                outcome = "win" if p.is_winner else "loss"
                body.append(
                    f"| {p.run_id} | {stamp} | `{trend.task}` | {p.score:.1f} | {delta_cell} | {outcome} |"
                )
        return "".join(parts) + "\n".join(body) + "\n"

    parts.append("| Agent | Task | Runs | Avg Score | Best Score | Latest Score | Trend |\n")
    parts.append("|-------|------|-----:|----------:|-----------:|-------------:|-------|\n")
    body = []
    for trend in trends:
        delta = trend.latest_delta
        if delta is None:
            arrow = "—"
        elif delta > 0.05:
            arrow = f"↑ +{delta:.1f}"
        elif delta < -0.05:
            arrow = f"↓ {delta:.1f}"
        else:
            arrow = f"→ {delta:.1f}"
        body.append(
            f"| {trend.agent} | `{trend.task}` | {trend.runs} | {trend.avg_score:.1f} | {trend.best_score:.1f} | {trend.latest_score:.1f} | {arrow} |"
        )
    return "".join(parts) + "\n".join(body) + "\n"
300
+
301
+
302
def format_trend_json(trends: list[AgentTaskTrend]) -> str:
    """Render trend data as pretty-printed, newline-terminated JSON.

    Shape: {"trends": [{"agent", "task", "runs": [...], "summary": {...}}]}.
    Scores are rounded to two decimals; ``trend_pct`` is the improvement
    rate as a percentage, or null with fewer than two runs.
    """

    def _round2(value: Optional[float]) -> Optional[float]:
        # Preserve null deltas while rounding real numbers for stable output.
        return None if value is None else round(value, 2)

    serialized = []
    for t in trends:
        run_entries = [
            {
                "run_id": p.run_id,
                "timestamp": p.timestamp,
                "score": round(p.score, 2),
                "delta": _round2(p.delta),
                "is_winner": p.is_winner,
            }
            for p in t.points
        ]
        rate = t.improvement_rate
        serialized.append(
            {
                "agent": t.agent,
                "task": t.task,
                "runs": run_entries,
                "summary": {
                    "total_runs": t.runs,
                    "avg_score": round(t.avg_score, 2),
                    "best_score": round(t.best_score, 2),
                    "latest_score": round(t.latest_score, 2),
                    "trend_pct": round(rate * 100, 1) if rate is not None else None,
                },
            }
        )
    return json.dumps({"trends": serialized}, indent=2) + "\n"
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "coderace"
7
- version = "1.9.0"
7
+ version = "2.0.0"
8
8
  description = "Race coding agents against each other on real tasks"
9
9
  readme = "README.md"
10
10
  license = "MIT"