coderace 1.9.0__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderace-2.0.0/BUILD-REPORT-trend.md +68 -0
- coderace-2.0.0/BUILD-REPORT.md +41 -0
- {coderace-1.9.0 → coderace-2.0.0}/CHANGELOG.md +15 -0
- {coderace-1.9.0 → coderace-2.0.0}/PKG-INFO +1 -1
- {coderace-1.9.0 → coderace-2.0.0}/coderace/__init__.py +1 -1
- {coderace-1.9.0 → coderace-2.0.0}/coderace/cli.py +77 -0
- coderace-2.0.0/coderace/commands/trend.py +330 -0
- {coderace-1.9.0 → coderace-2.0.0}/pyproject.toml +1 -1
- coderace-2.0.0/tests/commands/test_trend.py +409 -0
- coderace-1.9.0/BUILD-REPORT.md +0 -22
- {coderace-1.9.0 → coderace-2.0.0}/.claude-task.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/.github/workflows/examples/coderace-pr-review.yml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/.github/workflows/examples/coderace-quality-gate.yml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/.github/workflows/publish.yml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/.gitignore +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/DONE.txt +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/LICENSE +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/README.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/action.yml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-benchmark-tasks-v2.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-benchmark.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-builtin-tasks.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-ci-integration.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-context-eval.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-cost-tracking.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-dashboard.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-leaderboard.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-model-selection.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-race-mode.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-review-mode.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-v0.2.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-v090-tasks.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-v1.0-statistical.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-v1.6.0-github-action-review.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/all-day-build-contract-verification-tests.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/benchmark-results/fibonacci-2026-02-27.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/benchmark-results/fibonacci-v2-2026-02-27.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/benchmark-results/hard-tasks-2026-02-27.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/benchmark-results/multi-task-2026-02-27.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/__init__.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/aider.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/base.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/claude.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/codex.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/gemini.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/adapters/opencode.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/benchmark.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/benchmark_report.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/benchmark_stats.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/__init__.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/api-client.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/binary-search-tree.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/bug-hunt.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/cli-args-parser.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/concurrent-queue.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/csv-analyzer.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/data-pipeline.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/diff-algorithm.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/expression-evaluator.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/fibonacci.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/file-watcher.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/http-server.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/json-parser.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/lru-cache.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/markdown-to-html.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/refactor.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/regex-engine.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/state-machine.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/task-scheduler.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/builtins/tasks/url-router.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/__init__.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/benchmark.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/context_eval.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/dashboard.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/diff.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/gate.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/history.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/leaderboard.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/race.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/results.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/review.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/commands/tasks.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/context_eval.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/context_eval_report.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/cost.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/dashboard.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/display.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/elo.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/export.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/git_ops.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/html_report.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/maintainer_rubric.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/publish.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/reporter.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/review.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/review_report.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/scorer.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/statistics.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/stats.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/store.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/task.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/coderace/types.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/demo-race.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/examples/add-type-hints.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/examples/ci-race-on-pr.yml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/examples/context-eval-demo.sh +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/examples/example-task.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/examples/fix-edge-case.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/examples/model-selection.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/examples/write-tests.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/progress-log.md +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/scripts/ci-gate.sh +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/scripts/ci-review.sh +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/scripts/ci-run.sh +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/scripts/format-comment.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/scripts/format-review-comment.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tasks/markdown-table.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tasks/parse-duration.yaml +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/__init__.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/conftest.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/fixtures/sample.patch +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_adapters.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_benchmark.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_benchmark_tasks_v2.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_benchmark_trials.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_benchmark_v1_integration.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_builtins.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_ci_gate.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_cli.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_cli_store_integration.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_context_eval.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_context_eval_dashboard.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_cost.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_cost_config.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_cost_integration.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_dashboard.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_dashboard_cli.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_diff.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_elo.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_examples.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_export.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_format_comment.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_full_workflow.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_git_ops.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_github_action_review.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_history.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_html_report.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_leaderboard.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_maintainer_rubric.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_markdown_results.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_model_selection_d1_d2.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_model_selection_d3.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_model_selection_d4.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_publish.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_race.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_reporter.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_review.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_scorer.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_statistics.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_stats.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_store.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_task.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_tasks_cli.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/tests/test_verification_integration.py +0 -0
- {coderace-1.9.0 → coderace-2.0.0}/uv.lock +0 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# Build Report: coderace v2.0.0 trend command
|
|
2
|
+
|
|
3
|
+
**Date:** 2026-03-12
|
|
4
|
+
**Contract:** `/Users/mordecai/.openclaw/workspace/memory/contracts/coderace-v2.0.0-trend.md`
|
|
5
|
+
**Status:** SUCCESS
|
|
6
|
+
**Commit:** `7a33d88` — "feat: add trend command, bump to v2.0.0"
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## Deliverables Completed
|
|
11
|
+
|
|
12
|
+
- [x] `coderace/commands/trend.py` — full implementation
|
|
13
|
+
- [x] `coderace trend` registered in `coderace/cli.py`
|
|
14
|
+
- [x] `tests/commands/test_trend.py` — 32 new tests
|
|
15
|
+
- [x] `pyproject.toml` version bumped: 1.9.0 → 2.0.0
|
|
16
|
+
- [x] `coderace/__init__.py` version bumped: 1.9.0 → 2.0.0
|
|
17
|
+
- [x] `CHANGELOG.md` entry written
|
|
18
|
+
- [x] `coderace trend --help` confirmed working
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Test Results
|
|
23
|
+
|
|
24
|
+
| Suite | Tests | Passed | Failed |
|
|
25
|
+
|-------|------:|-------:|-------:|
|
|
26
|
+
| `tests/commands/test_trend.py` (new) | 32 | 32 | 0 |
|
|
27
|
+
| Full test suite (`tests/`) | 761 | 761 | 0 |
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Command Interface
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
Usage: coderace trend [OPTIONS]
|
|
35
|
+
|
|
36
|
+
Visualize agent score progression over time.
|
|
37
|
+
|
|
38
|
+
Options:
|
|
39
|
+
--agent TEXT Filter by agent name (also enables detailed per-task view)
|
|
40
|
+
--task TEXT Filter by task name
|
|
41
|
+
--days INTEGER Look back this many days (default: 30)
|
|
42
|
+
--format TEXT Output format: terminal (default) | markdown | json
|
|
43
|
+
--help Show this message and exit.
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Implementation Notes
|
|
49
|
+
|
|
50
|
+
- `coderace/commands/trend.py` contains all business logic (TrendPoint, AgentTaskTrend, sparkline, format functions) — CLI command in cli.py is thin glue only, matching the `history` command pattern
|
|
51
|
+
- Sparkline uses Unicode block chars `▁▂▃▄▅▆▇█` with ASCII fallback `_.-*^`
|
|
52
|
+
- Date filtering applied post-query (get_runs doesn't support `since` param directly)
|
|
53
|
+
- `--format json` returns structured `{ trends: [{ agent, task, runs, summary }] }` with improvement_rate as `trend_pct`
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Issues Encountered
|
|
58
|
+
|
|
59
|
+
1. **`CliRunner(mix_stderr=False)` unsupported** in this version of typer's CliRunner. Fixed: removed the argument.
|
|
60
|
+
2. **`coderace` on PATH is brew-installed v1.3.0** at `/opt/homebrew/bin/coderace`. The pipx version at `~/.local/bin/coderace` is v2.0.0 and has the `trend` command. Confirmed working: `~/.local/bin/coderace trend --help`. The PATH ordering issue is outside build scope.
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Stop Conditions Honored
|
|
65
|
+
|
|
66
|
+
- Did NOT push to PyPI
|
|
67
|
+
- Did NOT git push
|
|
68
|
+
- Did NOT tag release
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# coderace v1.9.0 Build Report
|
|
2
|
+
|
|
3
|
+
**Built:** 2026-03-12
|
|
4
|
+
**Version:** 1.9.0
|
|
5
|
+
**Commit:** 19d65382e908d292b0e42003f98cdf7832fea49a
|
|
6
|
+
**Tests:** 729 passing (700 baseline + 29 new)
|
|
7
|
+
**PyPI:** https://pypi.org/project/coderace/1.9.0/
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## What was built
|
|
12
|
+
|
|
13
|
+
**CI Quality Gate** — makes the maintainer rubric enforceable in CI.
|
|
14
|
+
|
|
15
|
+
METR published research (Mar 2026) showing ~50% of SWE-bench-passing PRs would be rejected by real maintainers. coderace v1.8.0 shipped the rubric as a diagnostic. v1.9.0 makes it a gate.
|
|
16
|
+
|
|
17
|
+
### D1: `--min-score` on `coderace review --maintainer-mode`
|
|
18
|
+
- New `--min-score N` flag (0-100 int)
|
|
19
|
+
- Exits 1 when composite rubric score < N
|
|
20
|
+
- Prints `✅ Maintainer score 87 ≥ 80 (gate: PASS)` or `❌ Maintainer score 54 < 80 (gate: FAIL)`
|
|
21
|
+
- Without `--min-score`: existing behavior unchanged
|
|
22
|
+
|
|
23
|
+
### D2: `coderace gate` standalone command
|
|
24
|
+
- `coderace gate --diff <file|-> --min-score 80`
|
|
25
|
+
- Accepts diff via file path or stdin (`--diff -`)
|
|
26
|
+
- Exits 0 (pass) or 1 (fail)
|
|
27
|
+
- `--json` flag for CI log parsing (score, gate, dimensions)
|
|
28
|
+
- Pure static analysis — no LLM, no API keys required
|
|
29
|
+
|
|
30
|
+
### D3: GitHub Action update
|
|
31
|
+
- New `action.yml` input: `maintainer-min-score` (default: empty = no gate, backward compatible)
|
|
32
|
+
- New `scripts/ci-gate.sh` CI script handling all diff sources
|
|
33
|
+
- Example workflow: `.github/workflows/examples/coderace-quality-gate.yml`
|
|
34
|
+
|
|
35
|
+
### D4: Tests (29 new)
|
|
36
|
+
- `tests/test_ci_gate.py` — 29 tests covering gate pass/fail, threshold edge cases, empty diff, JSON output, error handling, --min-score on review, action.yml structure
|
|
37
|
+
|
|
38
|
+
### D5: Docs
|
|
39
|
+
- README: "CI Quality Gate" section with one-liner examples and GitHub Action snippet
|
|
40
|
+
- CHANGELOG: v1.9.0 entry
|
|
41
|
+
- Version bumped: 1.8.0 → 1.9.0
|
|
@@ -1,5 +1,20 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [2.0.0] - 2026-03-12
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- `coderace trend` command: visualize agent score progression over time
|
|
7
|
+
- Summary table: Agent | Task | Runs | Avg Score | Best Score | Latest Score | Trend
|
|
8
|
+
- Unicode sparkline in Trend column (`▁▂▃▄▅▆▇█`) with delta arrow (`↑ +8.3` / `↓ -2.1` / `→ 0.0`)
|
|
9
|
+
- Color-coded output: green (improving), red (regressing), white (flat)
|
|
10
|
+
- `--agent`: filter to one agent and show detailed per-task score history
|
|
11
|
+
- `--task`: filter to one task
|
|
12
|
+
- `--days N`: look back N days (default: 30)
|
|
13
|
+
- `--format terminal|markdown|json`: multiple output formats
|
|
14
|
+
- `--format json` returns structured `{ agent, task, runs, summary }` suitable for CI
|
|
15
|
+
- Graceful empty-DB handling (no error, helpful message)
|
|
16
|
+
- 32 new tests covering unit logic, edge cases, and CLI integration
|
|
17
|
+
|
|
3
18
|
## [1.9.0] - 2026-03-12
|
|
4
19
|
|
|
5
20
|
### Added
|
|
@@ -1108,6 +1108,83 @@ def dashboard(
|
|
|
1108
1108
|
console.print(f"[dim]Opened in browser[/dim]")
|
|
1109
1109
|
|
|
1110
1110
|
|
|
1111
|
+
@app.command()
def trend(
    agent: str | None = typer.Option(
        None, "--agent", help="Filter by agent name (also enables detailed per-task view)"
    ),
    task: str | None = typer.Option(
        None, "--task", help="Filter by task name"
    ),
    days: int = typer.Option(
        30, "--days", help="Look back this many days (default: 30)"
    ),
    fmt: str | None = typer.Option(
        None,
        "--format",
        "-F",
        help="Output format: terminal (default) | markdown | json",
    ),
) -> None:
    """Visualize agent score progression over time.

    Loads run records from the result store, filters them by agent, task,
    and date window, groups them into per-(agent, task) trends, and renders
    the result in the requested output format. Exits with code 1 when the
    store cannot be opened or an unknown ``--format`` value is given.
    """
    import sys

    from coderace.commands.trend import (
        _build_trends,
        format_trend_json,
        format_trend_markdown,
        format_trend_terminal,
    )
    from coderace.store import ResultStore, _parse_since

    try:
        store = ResultStore()
    except Exception as exc:
        console.print(f"[red]Cannot open result store: {exc}[/red]")
        raise typer.Exit(1)

    try:
        # Fetch runs with optional filters; a generous limit stands in for
        # "all runs" since get_runs has no unbounded mode.
        runs = store.get_runs(
            task_name=task,
            agent=agent,
            limit=10000,
        )
        # Apply the date filter manually since get_runs doesn't support a
        # `since` parameter directly.
        if days:
            cutoff = _parse_since(f"{days}d")
            if cutoff:
                runs = [r for r in runs if r.timestamp >= cutoff]
    finally:
        store.close()

    trends = _build_trends(runs, agent_filter=agent, task_filter=task)

    if not trends:
        console.print("[yellow]No trend data found. Run some races first.[/yellow]")
        return

    if fmt == "markdown":
        sys.stdout.write(format_trend_markdown(trends, detail_agent=agent))
    elif fmt == "json":
        sys.stdout.write(format_trend_json(trends))
    elif fmt is not None and fmt != "terminal":
        console.print(
            f"[red]Unknown --format {fmt!r}. Choose: terminal, markdown, json[/red]"
        )
        raise typer.Exit(1)
    else:
        format_trend_terminal(trends, detail_agent=agent, console=console)
|
1186
|
+
|
|
1187
|
+
|
|
1111
1188
|
@app.command()
|
|
1112
1189
|
def version() -> None:
|
|
1113
1190
|
"""Show coderace version."""
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
"""Trend command — visualize agent score progression over time."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime, timedelta, timezone
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.table import Table
|
|
13
|
+
from rich.text import Text
|
|
14
|
+
|
|
15
|
+
from coderace.store import RunRecord, AgentRecord
|
|
16
|
+
|
|
17
|
+
# Unicode sparkline characters (low → high)
|
|
18
|
+
_SPARK_CHARS = "▁▂▃▄▅▆▇█"
|
|
19
|
+
_SPARK_ASCII = "_.-*^"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class TrendPoint:
    """A single data point in a trend series.

    Represents one agent's score from one recorded run, together with the
    change relative to that agent's previous run on the same task.
    """

    run_id: int  # id of the run this point came from
    timestamp: str  # run timestamp as stored in the result store
    score: float  # composite score the agent achieved on this run
    delta: Optional[float]  # None for first point; otherwise score minus previous score
    is_winner: bool  # whether this agent won the run
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class AgentTaskTrend:
    """Score trend for an (agent, task) pair.

    Holds the chronologically ordered score points for one agent on one task
    and derives summary statistics (average/best/latest score, latest delta,
    improvement rate) plus a sparkline rendering of the series.
    """

    agent: str
    task: str
    points: list[TrendPoint] = field(default_factory=list)

    @property
    def runs(self) -> int:
        """Number of recorded runs in this series."""
        return len(self.points)

    @property
    def avg_score(self) -> float:
        """Mean score across all points (0.0 for an empty series)."""
        count = len(self.points)
        if count == 0:
            return 0.0
        return sum(point.score for point in self.points) / count

    @property
    def best_score(self) -> float:
        """Highest score in the series (0.0 for an empty series)."""
        if not self.points:
            return 0.0
        return max(point.score for point in self.points)

    @property
    def latest_score(self) -> float:
        """Most recent score (0.0 for an empty series)."""
        return self.points[-1].score if self.points else 0.0

    @property
    def latest_delta(self) -> Optional[float]:
        """Score change of the most recent run, or None with fewer than 2 runs."""
        if len(self.points) >= 2:
            return self.points[-1].delta
        return None

    @property
    def improvement_rate(self) -> Optional[float]:
        """Pct of runs where score improved vs previous run."""
        if len(self.points) < 2:
            return None
        improved = 0
        for point in self.points[1:]:
            if point.delta is not None and point.delta > 0:
                improved += 1
        return improved / (len(self.points) - 1)

    def sparkline(self, use_unicode: bool = True) -> str:
        """Generate a sparkline string for the score series."""
        values = [point.score for point in self.points]
        # Fewer than two points carry no trend information to draw.
        if len(values) < 2:
            return "—"

        palette = "▁▂▃▄▅▆▇█" if use_unicode else "_.-*^"
        top = len(palette) - 1
        lo = min(values)
        spread = max(values) - lo

        cells: list[str] = []
        for value in values:
            if spread == 0:
                # Flat series: map every point to the middle glyph.
                level = (top + 1) // 2
            else:
                level = round((value - lo) / spread * top)
            cells.append(palette[level])
        return "".join(cells)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _trend_direction(delta: Optional[float]) -> tuple[str, str]:
|
|
103
|
+
"""Return (symbol, rich_style) for a delta value."""
|
|
104
|
+
if delta is None:
|
|
105
|
+
return "—", "dim"
|
|
106
|
+
if delta > 0.05:
|
|
107
|
+
return f"↑ +{delta:.1f}", "green"
|
|
108
|
+
if delta < -0.05:
|
|
109
|
+
return f"↓ {delta:.1f}", "red"
|
|
110
|
+
return f"→ {delta:.1f}", "white"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _build_trends(
    runs: list[RunRecord],
    agent_filter: Optional[str],
    task_filter: Optional[str],
) -> list[AgentTaskTrend]:
    """Build per-(agent, task) trend objects from run records.

    Groups every agent result by (agent, task), orders each group
    chronologically, and computes the run-over-run score delta within each
    group. Trends are returned sorted by (agent, task) for stable output.
    """
    # (agent, task) -> raw entries of (timestamp, run_id, score, is_winner)
    buckets: dict[tuple[str, str], list[tuple[str, int, float, bool]]] = {}

    for record in runs:
        if task_filter and record.task_name != task_filter:
            continue
        for result in record.agents:
            if agent_filter and result.agent != agent_filter:
                continue
            buckets.setdefault((result.agent, record.task_name), []).append(
                (record.timestamp, record.run_id, result.composite_score, result.is_winner)
            )

    out: list[AgentTaskTrend] = []
    for (agent_name, task_name), entries in sorted(buckets.items()):
        series = AgentTaskTrend(agent=agent_name, task=task_name)
        previous: Optional[float] = None
        # Oldest first, so each delta compares against the preceding run.
        for ts, run_id, score, won in sorted(entries, key=lambda e: e[0]):
            series.points.append(
                TrendPoint(
                    run_id=run_id,
                    timestamp=ts,
                    score=score,
                    delta=None if previous is None else score - previous,
                    is_winner=won,
                )
            )
            previous = score
        out.append(series)

    return out
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def format_trend_terminal(
    trends: list[AgentTaskTrend],
    detail_agent: Optional[str] = None,
    console: Optional[Console] = None,
) -> str:
    """Render trend data as a Rich terminal table.

    Prints the table (plus summary stats in detail mode) to *console* and
    returns the plain-text rendering of the table.

    Args:
        trends: Per-(agent, task) trend objects, as built by ``_build_trends``.
        detail_agent: When set, render a per-run detail view for this agent
            instead of the one-row-per-(agent, task) summary table.
        console: Console to print to; a fresh one is created when omitted.

    Returns:
        The table rendered as plain text ("" when there is no data).
    """
    console = console or Console()

    if not trends:
        console.print("[yellow]No trend data found.[/yellow]")
        return ""

    # Use the ASCII sparkline fallback unless stdout can encode the Unicode
    # block glyphs. Fixes two issues in the previous check: encoding may be
    # None (replaced/redirected streams) which crashed on .lower(), and
    # "us-ascii" was wrongly treated as Unicode-capable.
    encoding = getattr(sys.stdout, "encoding", None)
    use_unicode = encoding is None or encoding.lower().startswith("utf")

    if detail_agent:
        # Detailed per-task view for a single agent
        table = Table(title=f"coderace trend — {detail_agent}", show_lines=True)
        table.add_column("Run ID", justify="center", style="bold")
        table.add_column("Date", style="dim")
        table.add_column("Task", style="cyan")
        table.add_column("Score", justify="right")
        table.add_column("Delta", justify="right")
        table.add_column("Result")

        for trend in trends:
            for p in trend.points:
                ts = p.timestamp
                if "T" in ts:
                    # ISO timestamp -> "YYYY-MM-DD HH:MM:SS"
                    ts = ts.split("T")[0] + " " + ts.split("T")[1][:8]

                sym, sty = _trend_direction(p.delta)
                result_str = "win" if p.is_winner else "loss"
                result_style = "green" if p.is_winner else "dim"

                table.add_row(
                    str(p.run_id),
                    ts,
                    trend.task,
                    f"{p.score:.1f}",
                    Text(sym, style=sty),
                    Text(result_str, style=result_style),
                )

        console.print(table)

        # Summary stats
        total_runs = sum(t.runs for t in trends)
        all_scores = [p.score for t in trends for p in t.points]
        avg = sum(all_scores) / len(all_scores) if all_scores else 0.0
        best = max(all_scores) if all_scores else 0.0
        improvement_rates = [t.improvement_rate for t in trends if t.improvement_rate is not None]
        avg_impr = sum(improvement_rates) / len(improvement_rates) if improvement_rates else None

        console.print(f"\n[bold]Summary for {detail_agent}[/bold]")
        console.print(f" Total runs: {total_runs}")
        console.print(f" Avg score: {avg:.1f}")
        console.print(f" Best score: {best:.1f}")
        if avg_impr is not None:
            console.print(f" Improvement rate: {avg_impr:.0%}")
        else:
            console.print(" Improvement rate: — (need 2+ runs per task)")
    else:
        # Summary table: one row per (agent, task)
        table = Table(title="coderace trend", show_lines=True)
        table.add_column("Agent", style="cyan")
        table.add_column("Task")
        table.add_column("Runs", justify="right")
        table.add_column("Avg Score", justify="right")
        table.add_column("Best Score", justify="right")
        table.add_column("Latest Score", justify="right")
        table.add_column("Trend")

        for trend in trends:
            spark = trend.sparkline(use_unicode=use_unicode)
            sym, sty = _trend_direction(trend.latest_delta)

            # Color the whole trend cell by direction; a single run has no trend.
            if trend.runs < 2:
                trend_cell: str | Text = Text("—", style="dim")
            elif trend.latest_delta is not None and trend.latest_delta > 0.05:
                trend_cell = Text(f"{spark} {sym}", style="green")
            elif trend.latest_delta is not None and trend.latest_delta < -0.05:
                trend_cell = Text(f"{spark} {sym}", style="red")
            else:
                trend_cell = Text(f"{spark} {sym}", style="white")

            table.add_row(
                trend.agent,
                trend.task,
                str(trend.runs),
                f"{trend.avg_score:.1f}",
                f"{trend.best_score:.1f}",
                f"{trend.latest_score:.1f}",
                trend_cell,
            )

        console.print(table)

    # Plain-text rendering of whichever table was printed, for callers that
    # consume the return value. Previously only the summary path produced
    # this; the detail path implicitly returned None despite `-> str`.
    str_console = Console(file=None, force_terminal=False, width=120)
    with str_console.capture() as capture:
        str_console.print(table)
    return capture.get()
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def format_trend_markdown(
    trends: list[AgentTaskTrend],
    detail_agent: Optional[str] = None,
) -> str:
    """Render trend data as a markdown table.

    In detail mode (``detail_agent`` set) emits one row per recorded run;
    otherwise one summary row per (agent, task) pair.
    """
    if not trends:
        return "## coderace trend\n\n_No trend data found._\n"

    parts = ["## coderace trend\n\n"]

    if detail_agent:
        parts.append("| Run ID | Date | Task | Score | Delta | Result |\n")
        parts.append("|--------|------|------|------:|------:|--------|\n")
        body: list[str] = []
        for series in trends:
            for point in series.points:
                stamp = point.timestamp
                if "T" in stamp:
                    date_part, _, time_part = stamp.partition("T")
                    stamp = f"{date_part} {time_part[:8]}"
                if point.delta is None:
                    delta_text = "—"
                elif point.delta > 0:
                    delta_text = f"+{point.delta:.1f}"
                else:
                    delta_text = f"{point.delta:.1f}"
                outcome = "win" if point.is_winner else "loss"
                body.append(
                    f"| {point.run_id} | {stamp} | `{series.task}` | "
                    f"{point.score:.1f} | {delta_text} | {outcome} |"
                )
        return "".join(parts) + "\n".join(body) + "\n"

    parts.append("| Agent | Task | Runs | Avg Score | Best Score | Latest Score | Trend |\n")
    parts.append("|-------|------|-----:|----------:|-----------:|-------------:|-------|\n")
    body = []
    for series in trends:
        delta = series.latest_delta
        # Same ±0.05 flat band as the terminal renderer.
        if delta is None:
            marker = "—"
        elif delta > 0.05:
            marker = f"↑ +{delta:.1f}"
        elif delta < -0.05:
            marker = f"↓ {delta:.1f}"
        else:
            marker = f"→ {delta:.1f}"
        body.append(
            f"| {series.agent} | `{series.task}` | {series.runs} | {series.avg_score:.1f} | "
            f"{series.best_score:.1f} | {series.latest_score:.1f} | {marker} |"
        )
    return "".join(parts) + "\n".join(body) + "\n"
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def format_trend_json(trends: list[AgentTaskTrend]) -> str:
    """Render trend data as JSON.

    Produces ``{"trends": [...]}`` where each entry carries the raw run
    points plus a summary block; the improvement rate is exposed as
    ``trend_pct`` (a percentage).
    """

    def _point(p: TrendPoint) -> dict:
        # One raw run entry; scores rounded to 2 decimals for stable output.
        return {
            "run_id": p.run_id,
            "timestamp": p.timestamp,
            "score": round(p.score, 2),
            "delta": None if p.delta is None else round(p.delta, 2),
            "is_winner": p.is_winner,
        }

    def _summary(t: AgentTaskTrend) -> dict:
        rate = t.improvement_rate
        return {
            "total_runs": t.runs,
            "avg_score": round(t.avg_score, 2),
            "best_score": round(t.best_score, 2),
            "latest_score": round(t.latest_score, 2),
            "trend_pct": None if rate is None else round(rate * 100, 1),
        }

    payload = {
        "trends": [
            {
                "agent": t.agent,
                "task": t.task,
                "runs": [_point(p) for p in t.points],
                "summary": _summary(t),
            }
            for t in trends
        ]
    }
    return json.dumps(payload, indent=2) + "\n"
|