devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,272 @@
1
+ # Benchmark Suite Design — v1
2
+
3
+ **Outer goal**: see [`autoresearch/NORTH-STAR.md`](../../autoresearch/NORTH-STAR.md) — the harness composes frontier LLMs into a hands-free pipeline that delivers engineer-quality software for users who do not know context engineering, with each composition layer (L0 bare → L1 solo harness → L2 pair harness) justifying its added cost on quality AND wall-time efficiency. This benchmark is the measurement instrument for that contract.
4
+
5
+ **Purpose.** Replace ad-hoc A/B benchmarking with a permanent, comprehensive,
6
+ one-command suite that gates every future harness change with a ship/rollback
7
+ decision. Any prompt edit, phase reorder, new native skill, or model upgrade
8
+ can be validated by running the suite and reading the numbers.
9
+
10
+ **Arm structure (current vs planned).** Today the suite runs `variant` (L2: Claude + Codex pair) vs `bare` (L0). The L1 (solo harness on a single LLM) arm is queued for iter-0020 — until then the benchmark cannot directly verify the L1 contract, only the L0 ↔ L2 delta. Single-LLM users (Opus alone, GPT-5.5 alone) are first-class per the North Star, so this gap is a release-blocker for them, not a future enhancement.
11
+
12
+ **Non-goals.** Publishable-research statistical rigor. Not a regression test
13
+ library for the product code — those live elsewhere. Not a substitute for
14
+ production telemetry — just enough signal for ship decisions.
15
+
16
+ ---
17
+
18
+ ## Principles
19
+
20
+ 1. **One command.** `npx devlyn-cli benchmark` runs everything and prints a
21
+ verdict. No manual fixture setup.
22
+ 2. **Novice-proof.** The suite exercises the same paths a first-time user
23
+ hits — including an end-to-end `ideate → auto-resolve → preflight` fixture.
24
+ 3. **LLM-upgrade friendly.** Rubric, fixture semantics, and thresholds stay
25
+ stable; scores and margins float up as models improve. Nothing is
26
+ hardcoded to a specific model version.
27
+ 4. **Karpathy.** No fixture earns its place unless it tests a distinct
28
+ failure mode. Tooling stays boring. History plumbing is simple.
29
+ 5. **Ship gate is numbers, not vibes.** Concrete thresholds in RUBRIC.md.
30
+
31
+ ---
32
+
33
+ ## Directory Layout
34
+
35
+ ```
36
+ benchmark/auto-resolve/
37
+ ├── BENCHMARK-DESIGN.md # this file
38
+ ├── README.md # how to run, interpret, extend
39
+ ├── RUBRIC.md # stable judge rubric + ship gates
40
+
41
+ ├── fixtures/
42
+ │ ├── SCHEMA.md # fixture file format
43
+ │ ├── test-repo/ # bootstrap Node project (shared base)
44
+ │ │ ├── bin/cli.js
45
+ │ │ ├── server/index.js
46
+ │ │ ├── web/index.html
47
+ │ │ ├── tests/
48
+ │ │ ├── playwright.config.js
49
+ │ │ └── package.json
50
+ │ │
51
+ │ ├── F1-cli-trivial-flag/
52
+ │ ├── F2-cli-medium-subcommand/
53
+ │ ├── F3-backend-contract-risk/
54
+ │ ├── F4-web-browser-design/
55
+ │ ├── F5-fix-loop-red-green/
56
+ │ ├── F6-dep-audit-native-module/
57
+ │ ├── F7-out-of-scope-trap/
58
+ │ ├── F8-known-limit-ambiguous/
59
+ │ └── F9-e2e-ideate-to-resolve/
60
+
61
+ ├── scripts/
62
+ │ ├── run-suite.sh # single entry — runs all fixtures × 2 arms + judge + report
63
+ │ ├── run-fixture.sh # one fixture, one arm
64
+ │ ├── judge.sh # Codex blind judge (model-agnostic)
65
+ │ ├── compile-report.py # aggregate into report.md + summary.json
66
+ │ └── ship-gate.py # apply thresholds, return ship/rollback verdict
67
+
68
+ ├── results/ # per-run artifacts (overwritten)
69
+ │ └── <run-id>/
70
+ │ ├── <fixture>/
71
+ │ │ ├── variant/{input.md, transcript.txt, diff.patch, verify.json, timing.json}
72
+ │ │ └── bare/{same}
73
+ │ ├── <fixture>/judge.json
74
+ │ ├── report.md
75
+ │ └── summary.json
76
+
77
+ └── history/
78
+ ├── runs/ # append-only immutable records
79
+ │ └── 2026-04-23T120000Z-v3.6.json
80
+ ├── latest.json # pointer to most recent run
81
+ └── baselines/
82
+ └── shipped.json # last blessed version, used for regression check
83
+ ```
84
+
85
+ ---
86
+
87
+ ## Fixture Schema
88
+
89
+ Every fixture is a directory with these files (see `fixtures/SCHEMA.md`):
90
+
91
+ | File | Purpose |
92
+ |------|---------|
93
+ | `metadata.json` | id, category, difficulty, timeout, required tools, intent block |
94
+ | `spec.md` | pipeline-arm input (auto-resolve-ready spec with Requirements/Constraints/Out-of-Scope/Verification) |
95
+ | `task.txt` | bare-arm input (same intent, natural-language framing) |
96
+ | `expected.json` | machine-readable acceptance criteria + forbidden patterns + verification commands |
97
+ | `NOTES.md` | why this fixture exists, the specific failure mode it tests |
98
+ | `setup.sh` | deterministic starting state — applies to a fresh copy of `test-repo/` |
99
+
100
+ **Drift prevention**: `spec.md` and `task.txt` both derive from the same
101
+ `intent` block in `metadata.json`. A lint step in CI verifies they stay
102
+ consistent.
103
+
104
+ ---
105
+
106
+ ## The 9 Fixtures
107
+
108
+ Category coverage matrix (rows = fixtures, columns = concerns):
109
+
110
+ | Fixture | Trivial | Medium | High-risk | Stress | Edge | E2E |
111
+ |---------|---------|--------|-----------|--------|------|-----|
112
+ | F1-cli-trivial-flag | ✓ | | | | | |
113
+ | F2-cli-medium-subcommand | | ✓ | | | | |
114
+ | F3-backend-contract-risk | | | ✓ | | | |
115
+ | F4-web-browser-design | | | | ✓ (browser-validate) | | |
116
+ | F5-fix-loop-red-green | | | | ✓ (FIX LOOP) | | |
117
+ | F6-dep-audit-native-module | | | | ✓ (CRITIC security dep audit) | | |
118
+ | F7-out-of-scope-trap | | | | ✓ (scope discipline) | | |
119
+ | F8-known-limit-ambiguous | | | | | ✓ (documents where pipeline may lose) | |
120
+ | F9-e2e-ideate-to-resolve | | | | | | ✓ (novice full-flow) |
121
+
122
+ **F9 is load-bearing** for the "novice user types `/devlyn:ideate`" promise.
123
+ Input is a vague idea; pipeline arm runs ideate → auto-resolve on every
124
+ generated spec → preflight; bare arm runs a direct prompt. Judge compares
125
+ the final usable artifact set (code + docs + roadmap state).
126
+
127
+ ---
128
+
129
+ ## Single-Command Invocation
130
+
131
+ ### User experience
132
+
133
+ ```bash
134
+ npx devlyn-cli benchmark # n=1 smoke, all fixtures
135
+ npx devlyn-cli benchmark --n 3 # higher confidence for ship decisions
136
+ npx devlyn-cli benchmark F2 F5 # specific fixtures only
137
+ npx devlyn-cli benchmark --judge-only --run-id <id> # re-judge without re-running
138
+ ```
139
+
140
+ Output on completion:
141
+
142
+ ```
143
+ Benchmark Suite Run — 2026-04-23T12:00Z (v3.6)
144
+ Judge: codex CLI flagship, xhigh, blind (model recorded in run history)
145
+
146
+ Fixture Variant Bare Margin Verdict
147
+ F1-cli-trivial-flag 95 88 +7 PASS
148
+ F2-cli-medium-subcommand 92 81 +11 PASS
149
+ F3-backend-contract-risk 89 72 +17 PASS
150
+ F4-web-browser-design 87 79 +8 PASS
151
+ F5-fix-loop-red-green 91 65 +26 PASS
152
+ F6-dep-audit-native-module 88 70 +18 PASS
153
+ F7-out-of-scope-trap 94 73 +21 PASS
154
+ F8-known-limit-ambiguous 78 79 -1 EXPECTED (known-limit)
155
+ F9-e2e-ideate-to-resolve 90 68 +22 PASS
156
+ ---------------------------------------------------------
157
+ Suite average variant score: 89.3
158
+ Suite average bare score: 75.0
159
+ Suite average margin: +14.3 (ship floor: +5)
160
+ Hard-floor violations: 0
161
+ Regression vs shipped: n/a (first run of v3.6)
162
+ SHIP-GATE VERDICT: ✅ PASS
163
+ ```
164
+
165
+ ### Runner orchestration
166
+
167
+ `run-suite.sh`:
168
+
169
+ 1. Generate run-id `<ISO>-<sha>-<branch>`
170
+ 2. For each fixture × each arm (variant, bare): parallelizable via `xargs -P`
171
+ - `run-fixture.sh --fixture FX --arm variant` → writes `results/<run-id>/FX/variant/*`
172
+ 3. For each fixture: `judge.sh FX <run-id>` → writes `results/<run-id>/FX/judge.json`
173
+ 4. `compile-report.py <run-id>` → writes `report.md` + `summary.json`
174
+ 5. `ship-gate.py <run-id>` → exit 0 (PASS) / 1 (FAIL). Prints verdict to stdout.
175
+ 6. If PASS and `--bless` flag: copy `summary.json` → `history/baselines/shipped.json`
176
+ 7. Always: append `history/runs/<run-id>.json` + update `latest.json`
177
+
178
+ ### `run-fixture.sh` contract
179
+
180
+ - Creates fresh temp copy of `test-repo/` at `/tmp/bench-<run-id>-<fixture>-<arm>/`
181
+ - Applies `setup.sh` if present
182
+ - Copies `spec.md` (variant) or `task.txt` (bare) as the prompt
183
+ - Invokes Claude/auto-resolve (variant) or bare Claude (bare) via isolated Agent
184
+ - Captures: `diff.patch`, `changed-files.txt`, `transcript.txt`, `timing.json`
185
+ - Runs `expected.json::verification_commands`, writes pass/fail per command to `verify.json`
186
+ - Writes `result.json` with aggregate: exit code, duration, files changed, verification score
187
+
188
+ ### `judge.sh` contract
189
+
190
+ - Reads `results/<run-id>/<fixture>/{variant,bare}/{diff.patch,verify.json}` + fixture's `spec.md` + `expected.json`
191
+ - Builds a blind prompt: labels arms A and B randomly per fixture (seed recorded)
192
+ - Invokes `codex exec` (current flagship — no model hardcode) with RUBRIC.md
193
+ - Writes `judge.json`: per-axis scores, winner, margin, critical findings, disqualifiers
194
+ - Idempotent: re-running overwrites the same `judge.json`
195
+
196
+ ---
197
+
198
+ ## LLM-Upgrade Resilience
199
+
200
+ Three mechanisms:
201
+
202
+ 1. **No hardcoded models.** Judge invocation is `codex exec` without `-m`; it
203
+ inherits whichever flagship the CLI currently ships. Same for agents —
204
+ they run against whatever Claude Code session-model the caller has.
205
+ Model provenance is captured in `result.json` per run.
206
+
207
+ 2. **Margin as primary signal, absolute score as secondary.** When models
208
+ improve, both arms get better. Margin (variant − bare) is model-invariant
209
+ — it measures **what the harness adds beyond bare**. Ship gates are
210
+ defined on margin (`>= +5`) and regression (`-3 or worse`), not absolute
211
+ score.
212
+
213
+ 3. **Fixture difficulty gradient.** F1 (trivial) is expected to saturate near
214
+ 100 quickly as models improve — that's fine, it still catches catastrophic
215
+ regressions. F5/F9 (stress/E2E) have enough depth that even a near-perfect
216
+ model won't 100-zero bare. If any fixture saturates (both arms > 95 for
217
+ two consecutive versions), we replace it with a harder one and document
218
+ the swap in `history/runs/<ts>-fixture-rotation.json`.
219
+
220
+ ---
221
+
222
+ ## Ship Gates (from RUBRIC.md)
223
+
224
+ Hard floors (any single failure blocks ship):
225
+
226
+ - **No silent-catch / fabricated verification / skipped required test in variant.** Judge flags this as disqualifier.
227
+ - **No fixture may regress by more than 5 points** versus the previously shipped version (per-fixture regression floor).
228
+ - **At least 7 of 9 fixtures** must have margin ≥ +5 (suite coverage).
229
+ - **F9 (E2E) must PASS** — novice-flow contract.
230
+
231
+ Soft gates (trigger rollback discussion):
232
+
233
+ - Suite average margin drop > 3 vs last shipped.
234
+ - Any fixture with margin ≤ 0 that previously had margin > +5.
235
+ - Critical-finding catch-rate decrease vs last shipped variant (not vs bare — bare is the opponent, not the regression baseline).
236
+
237
+ Known-limit exception:
238
+
239
+ - F8 is explicitly allowed to tie or lose (margin in [-3, +3]). Its job is to
240
+ document honesty, not to beat bare.
241
+
242
+ ---
243
+
244
+ ## Karpathy Check
245
+
246
+ Where over-engineering lurks:
247
+
248
+ - ❌ **Automatic history mutation during development.** Add append-only
249
+ history AFTER the suite format stabilizes (one version after initial ship).
250
+ - ❌ **Statistical tooling beyond mean/median/margin.** n=1-3 doesn't need
251
+ t-tests.
252
+ - ❌ **Auto-generated fixture cards / dashboards.** Plain `report.md` is enough.
253
+ - ✅ **Keep scripts under 100 lines each** unless they're doing concrete,
254
+ repeated work the user would do by hand.
255
+
256
+ If the suite tooling grows past ~800 total lines, prune aggressively before
257
+ adding anything.
258
+
259
+ ---
260
+
261
+ ## Open Questions (to be answered before first full ship-gate run)
262
+
263
+ 1. Where does `benchmark` subcommand live? Inside `bin/devlyn.js` or as
264
+ standalone `benchmark/auto-resolve/scripts/run-suite.sh` invoked via `npm
265
+ run`? **Proposal**: both — `bin/devlyn.js benchmark` is the advertised
266
+ entry, which shells out to the script.
267
+ 2. Parallel run safety — can we run 9 fixtures × 2 arms concurrently without
268
+ rate-limit / lockfile conflicts? **Proposal**: default sequential with
269
+ `--parallel N` flag. Default `N=1` for safety; the user can opt in.
270
+ 3. Token accounting — Claude Code doesn't expose subagent totals reliably.
271
+ **Proposal**: capture wall time as primary efficiency metric; token
272
+ estimate as best-effort secondary. Do not gate ship on token math alone.
@@ -0,0 +1,114 @@
1
+ # devlyn-cli auto-resolve Benchmark Suite
2
+
3
+ One-command A/B benchmark that gates every harness change with a ship/rollback decision.
4
+
5
+ ## Quick start
6
+
7
+ ```bash
8
+ npx devlyn-cli benchmark # n=1 smoke, all fixtures × 2 arms, judge, report, ship-gate
9
+ npx devlyn-cli benchmark --n 3 # higher confidence for ship decisions
10
+ npx devlyn-cli benchmark F2 # specific fixture only
11
+ npx devlyn-cli benchmark --dry-run # validate suite wiring without model invocation
12
+ npx devlyn-cli benchmark --bless # if ship-gate PASSes, promote this run as the shipped baseline
13
+ npx devlyn-cli benchmark --judge-only --run-id <ID> # re-judge an existing run's artifacts
14
+ ```
15
+
16
+ Exit code 0 = PASS, 1 = FAIL.
17
+
18
+ ## What it does
19
+
20
+ 1. For every fixture × arm (`variant` / `bare`):
21
+ - Prepare a fresh temp copy of `fixtures/test-repo/`.
22
+ - Commit baseline + apply `setup.sh` + commit bench scaffolding.
23
+ - Invoke the arm via an isolated `claude -p` subprocess.
24
+ - Capture `diff.patch`, `transcript.txt`, `timing.json`, run `expected.json::verification_commands`.
25
+ 2. For every fixture, invoke `codex exec` as a blind judge (`A`/`B` randomized per fixture) using the 4-axis rubric in `RUBRIC.md`.
26
+ 3. Aggregate into `results/<run-id>/report.md` + `summary.json`.
27
+ 4. Apply ship-gate thresholds (`scripts/ship-gate.py`). Print verdict.
28
+ 5. Append immutable record to `history/runs/<run-id>.json`.
29
+
30
+ ## Directory layout
31
+
32
+ ```
33
+ benchmark/auto-resolve/
34
+ ├── BENCHMARK-DESIGN.md # full design rationale
35
+ ├── README.md # this file
36
+ ├── RUBRIC.md # 4-axis scoring + ship gates
37
+
38
+ ├── fixtures/
39
+ │ ├── SCHEMA.md # fixture file format
40
+ │ ├── test-repo/ # bootstrap Node project — base for all arms
41
+ │ ├── F2-cli-medium-subcommand/
42
+ │ └── F1,F3-F9/ # add per Stage 2-3
43
+
44
+ ├── scripts/
45
+ │ ├── run-suite.sh # single entry — called by `npx devlyn-cli benchmark`
46
+ │ ├── run-fixture.sh # one fixture × one arm, self-contained
47
+ │ ├── judge.sh # Codex blind judge for one fixture
48
+ │ ├── compile-report.py # aggregates into report.md + summary.json
49
+ │ └── ship-gate.py # applies thresholds + writes history record
50
+
51
+ ├── results/<run-id>/ # per-run artifacts (overwritten)
52
+ └── history/
53
+ ├── runs/ # append-only, one JSON per run
54
+ ├── latest.json # pointer to most recent run
55
+ └── baselines/shipped.json # last blessed version, used for regression floor
56
+ ```
57
+
58
+ ## Prerequisites
59
+
60
+ - `claude` CLI on PATH (Claude Code, used to invoke each arm).
61
+ - `codex` CLI on PATH (used by the blind judge). Install from https://platform.openai.com/docs/codex.
62
+ - `python3`, `node`, `git`, `timeout`.
63
+
64
+ ## Adding a fixture
65
+
66
+ Follow `fixtures/SCHEMA.md`. Six files per fixture: `metadata.json`, `spec.md`, `task.txt`, `expected.json`, `NOTES.md`, `setup.sh`. Common workflow:
67
+
68
+ 1. Copy an existing fixture directory as a template.
69
+ 2. Rewrite `metadata.json::intent` with the new task's plain-language intent.
70
+ 3. Write `spec.md` (auto-resolve-ready) and `task.txt` (plain prompt) both derived from the intent.
71
+ 4. Fill `expected.json` with concrete verification commands and forbidden patterns.
72
+ 5. Document purpose + failure mode in `NOTES.md`.
73
+ 6. Add `setup.sh` if the task needs the base `test-repo` modified before either arm starts.
74
+
75
+ ## LLM-upgrade resilience
76
+
77
+ - **No model hardcoding.** Judge runs `codex exec` without `-m`, inheriting whichever flagship the CLI currently ships. Each run captures `_judge_model` for historical provenance.
78
+ - **Margin-based gates.** Ship thresholds use margin (variant − bare), not absolute score. Both arms improve together as models improve; the harness-added value measured by margin stays meaningful.
79
+ - **Saturation rotation.** When both arms exceed 95 on a fixture for two shipped versions, rotate it (see `RUBRIC.md::Fixture Rotation Policy`).
80
+
81
+ ## Ship gates (summary — see `RUBRIC.md` for full spec)
82
+
83
+ Hard floors (any one fails → block):
84
+
85
+ - Zero variant disqualifier (silent catch, fabricated verification, extra deps beyond `max_deps_added`, etc.).
86
+ - `F9-e2e-ideate-to-resolve` must PASS (novice-flow contract).
87
+ - ≥ 7 of 9 gated fixtures have margin ≥ +5.
88
+ - No per-fixture regression worse than −5 vs last shipped baseline.
89
+
90
+ Soft gates (warning, not block): suite-margin drop > 3, fixture losing its margin, critical-finding catch-rate regression vs last shipped variant.
91
+
92
+ ## Running the full suite (real)
93
+
94
+ A full real benchmark run takes roughly 2-3 minutes per arm for simple fixtures and up to 15 minutes per arm for strict-route fixtures. A full n=1 run of 9 fixtures × 2 arms can take anywhere from 30 minutes to 2 hours, depending on the routes taken.
95
+
96
+ ```bash
97
+ # Smoke run before ship decisions
98
+ npx devlyn-cli benchmark
99
+
100
+ # Ship-decision run
101
+ npx devlyn-cli benchmark --n 3 --label v3.7 --bless
102
+ ```
103
+
104
+ ## Dry-run
105
+
106
+ `--dry-run` skips model invocation. It still:
107
+
108
+ - Prepares each fresh work dir.
109
+ - Writes arm-specific prompts.
110
+ - Commits the baseline.
111
+ - Applies `setup.sh`.
112
+ - Runs verification commands (which will mostly fail since no implementation was added).
113
+
114
+ Use it to sanity-check new fixtures or runner changes before burning model tokens.
@@ -0,0 +1,162 @@
1
+ # Benchmark Judge Rubric
2
+
3
+ Stable across model upgrades. This file is the single source of truth for how
4
+ arms are scored and how ship gates evaluate a run. Do not change the rubric
5
+ during a benchmarking window — changing it invalidates comparability with
6
+ prior `history/runs/`.
7
+
8
+ **Outer goal lives in [`autoresearch/NORTH-STAR.md`](../../autoresearch/NORTH-STAR.md).** The release-decision layer (L0 / L1 / L2 contracts, wall-time efficiency, pair-cost justification) sits on top of the per-arm scoring rules below. When NORTH-STAR.md adds a release-gate number that this file did not have, the new number applies — open a doc-fix iter to mirror it here.
9
+
10
+ ## Scoring — 4 axes, 25 points each, 100 total
11
+
12
+ The blind judge scores both arms on identical axes without knowing which is
13
+ variant vs. bare.
14
+
15
+ ### Axis 1 — Spec Compliance (0-25)
16
+
17
+ Does this implementation satisfy every Requirements bullet in `spec.md`?
18
+ Does every Verification command behave as the spec states?
19
+
20
+ - **25** — All Requirements satisfied. All Verification commands would pass.
21
+ - **19-24** — 90%+ coverage, minor omissions.
22
+ - **13-18** — Partial implementation or verification gaps.
23
+ - **7-12** — Major requirements missed.
24
+ - **0-6** — Does not address the core task.
25
+
26
+ ### Axis 2 — Constraint Respect (0-25)
27
+
28
+ Zero new npm deps (unless spec allows), no silent catches (`try { } catch { return fallback }`), no `any`/`@ts-ignore`, explicit HOME/env guards where required, EACCES-specific handling, no hardcoded values that should be configurable.
29
+
30
+ - Each **disqualifier-severity forbidden_pattern match** or explicit constraint violation = −4 points, minimum 0.
31
+
32
+ ### Axis 3 — Scope Discipline (0-25)
33
+
34
+ Out of Scope respected. No gratuitous refactors of unrelated code. No "while I'm here" additions. No opportunistic upgrades.
35
+
36
+ - Each out-of-scope change = −5 points, minimum 0.
37
+
38
+ ### Axis 4 — Code Quality (0-25)
39
+
40
+ Readable, idiomatic for the language/framework, helpful error messages, appropriate abstraction level (not under- or over-engineered), uses standard library primitives where available (e.g., `fs.accessSync` over mode-bit checks per CLAUDE.md `phase-3-critic.md:32` calibration).
41
+
42
+ - This axis is judge-calibrated; no deterministic grading. The judge looks at
43
+ naming, function/file decomposition, error handling, and comparability to
44
+ idiomatic peer code.
45
+
46
+ ---
47
+
48
+ ## Judge Disqualifiers (hard floor)
49
+
50
+ Any of these produces `disqualifier: true` on the arm, overriding score:
51
+
52
+ - Silent-catch pattern in diff.
53
+ - Fabricated verification (code that claims to pass without actually running).
54
+ - Skipped a required test file that the spec names.
55
+ - Created a file listed in `expected.forbidden_files`.
56
+ - Exceeded `expected.max_deps_added`.
57
+ - `@ts-ignore` / `eslint-disable` without scoped justification comment.
58
+ - Hardcoded paths or values where spec required configurability.
59
+
60
+ Disqualifier arms automatically lose the fixture regardless of score.
61
+
62
+ ---
63
+
64
+ ## Ship Gates
65
+
66
+ After the judge finishes every fixture, `scripts/ship-gate.py` applies these
67
+ rules to the run's `summary.json`.
68
+
69
+ ### Hard floors (any one failure blocks ship)
70
+
71
+ 1. **No disqualifier-level violation** in variant on any fixture.
72
+ 2. **F9 (E2E) must PASS** — novice-flow contract.
73
+ 3. **≥ 7 of 9 fixtures** must have margin ≥ +5 — **headroom-aware** (added 2026-05-02 per iter-0033 R4 + NORTH-STAR amendment): a fixture is excluded from this count when `100 - L0_score < 5` AND `L1_score >= 95` AND the L1 arm has no disqualifier / CRITICAL-HIGH finding / watchdog timeout / regression worse than gate #4. Excluded fixtures become fixture-rotation candidates per the policy below if the two-shipped-version rule is met.
74
+ 4. **No fixture regression worse than −5** vs. last `baselines/shipped.json` on the same fixture.
75
+
76
+ ### Soft gates (produce WARNING but do not block)
77
+
78
+ 5. Suite average margin drop > 3 vs. last shipped.
79
+ 6. A fixture that previously had margin > +5 now has margin ≤ 0.
80
+ 7. Critical-finding catch-rate decrease vs. last shipped variant (not vs. bare).
81
+
82
+ ### Known-limit exception
83
+
84
+ - **F8-known-limit-ambiguous** is excluded from gates 3 and 4. It exists to
85
+ document where the harness may not beat bare. Its allowed margin range is
86
+ [-3, +3]. Margins outside this range trigger a WARNING regardless of sign
87
+ (too-good means the fixture is no longer a known limit; too-bad means we
88
+ shipped a regression somewhere else that this fixture caught).
89
+
90
+ ---
91
+
92
+ ## Run Record
93
+
94
+ Every suite run appends an immutable record to `history/runs/<ts>-<label>.json`:
95
+
96
+ ```json
97
+ {
98
+ "run_id": "2026-04-23T12:00:00Z-v3.6",
99
+ "version_label": "v3.6",
100
+ "git_sha": "fdb7428...",
101
+ "branch": "benchmark/v3.6-ab-...",
102
+ "n_per_fixture": 1,
103
+ "judge_model": "<recorded from ~/.codex/config.toml at run time; do not hardcode>",
104
+ "judge_effort": "xhigh",
105
+ "fixtures": [
106
+ {
107
+ "id": "F2-cli-medium-subcommand",
108
+ "variant": { "score": 92, "wall_s": 707, "tokens_agg": 108852, "disqualifier": false,
109
+ "axes": {"spec": 23, "constraint": 23, "scope": 24, "quality": 22} },
110
+ "bare": { "score": 81, "wall_s": 101, "tokens_agg": 55588, "disqualifier": false,
111
+ "axes": {"spec": 19, "constraint": 19, "scope": 20, "quality": 23} },
112
+ "winner": "variant",
113
+ "margin": 11,
114
+ "critical_findings": {
115
+ "variant": [],
116
+ "bare": ["silent catch in findSkillMdFiles (no-silent-catches violation)"]
117
+ }
118
+ }
119
+ ],
120
+ "suite": {
121
+ "fixtures_run": 9,
122
+ "variant_avg": 89.3,
123
+ "bare_avg": 75.0,
124
+ "margin_avg": 14.3,
125
+ "hard_floor_violations": 0,
126
+ "ship_gate": "PASS"
127
+ }
128
+ }
129
+ ```
130
+
131
+ ---
132
+
133
+ ## Fixture Rotation Policy
134
+
135
+ If any fixture has both arms scoring > 95 for two consecutive shipped
136
+ versions, it's saturated and no longer differentiates. Replace with a harder
137
+ equivalent and record the swap in
138
+ `history/runs/<ts>-fixture-rotation.json`:
139
+
140
+ ```json
141
+ {
142
+ "retired": "F1-cli-trivial-flag",
143
+ "retired_reason": "both arms > 95 on v3.7 and v3.8 (saturation)",
144
+ "replacement": "F1b-cli-trivial-flag-v2",
145
+ "replacement_rationale": "adds exit-code precedence requirement that current leaders didn't handle on first try"
146
+ }
147
+ ```
148
+
149
+ Retired fixtures stay in `fixtures/retired/` for replay if a regression is
150
+ suspected in their area.
151
+
152
+ ---
153
+
154
+ ## Why These Thresholds
155
+
156
+ - **+5 margin floor** — below this, variant isn't reliably beating bare given
157
+ judge variance (empirically ~±3 per axis). Paying the pipeline cost is only
158
+ justified when the margin is clearly above noise.
159
+ - **−5 regression floor** — a regression on a single axis can show up as −5;
160
+ a looser floor would let real regressions slip through.
161
+ - **7/9 fixtures rule** — tolerates one close-call + F8 known-limit; anything
162
+ worse means the suite is surfacing a broad harness problem.
@@ -0,0 +1,30 @@
1
+ # F1 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Trivial-tier calibration. Every arm should one-shot this; it's here to catch
6
+ catastrophic regressions and to anchor the "saturation" end of the scoring
7
+ scale.
8
+
9
+ ## Failure mode
10
+
11
+ - **Default-behavior regression.** Careless implementations add `--loud`
12
+ handling but accidentally alter the default case (e.g., always uppercasing
13
+ because the flag-check is misplaced). Verification commands 1 and 4 guard
14
+ against that.
15
+ - **Scope creep.** Modifying unrelated code while "here" would be caught by
16
+ both the CRITIC design sub-pass and the `git diff --stat` spec requirement.
17
+
18
+ ## Pipeline exercise
19
+
20
+ - Phase 0 routing: expected `standard` route (no risk keywords).
21
+ - Phase 1 BUILD: single-file edit.
22
+ - Phase 1.4 BUILD GATE: `node --check` + `node --test` both must pass.
23
+ - Phase 2 EVAL: minimal findings expected.
24
+ - Phase 3 CRITIC design: verifies diff surgical-ness.
25
+
26
+ ## Rotation trigger
27
+
28
+ When both arms score > 95 for two consecutive shipped versions, replace with
29
+ a harder trivial fixture (e.g., one that requires handling a new flag
30
+ interacting with existing flag precedence).
@@ -0,0 +1,68 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node bin/cli.js hello",
5
+ "exit_code": 0,
6
+ "stdout_contains": [
7
+ "Hello, world!"
8
+ ],
9
+ "stdout_not_contains": [
10
+ "HELLO"
11
+ ]
12
+ },
13
+ {
14
+ "cmd": "node bin/cli.js hello --loud",
15
+ "exit_code": 0,
16
+ "stdout_contains": [
17
+ "HELLO, WORLD!!"
18
+ ],
19
+ "stdout_not_contains": []
20
+ },
21
+ {
22
+ "cmd": "node bin/cli.js hello --loud --name alice",
23
+ "exit_code": 0,
24
+ "stdout_contains": [
25
+ "HELLO, ALICE!!"
26
+ ],
27
+ "stdout_not_contains": []
28
+ },
29
+ {
30
+ "cmd": "node bin/cli.js hello --name bob",
31
+ "exit_code": 0,
32
+ "stdout_contains": [
33
+ "Hello, bob!"
34
+ ],
35
+ "stdout_not_contains": [
36
+ "HELLO"
37
+ ]
38
+ },
39
+ {
40
+ "cmd": "node --test tests/cli.test.js",
41
+ "exit_code": 0,
42
+ "stdout_contains": [],
43
+ "stdout_not_contains": [
44
+ "not ok "
45
+ ]
46
+ }
47
+ ],
48
+ "forbidden_patterns": [
49
+ {
50
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|'')",
51
+ "description": "silent catch returning fallback",
52
+ "files": [
53
+ "bin/cli.js"
54
+ ],
55
+ "severity": "disqualifier"
56
+ }
57
+ ],
58
+ "required_files": [
59
+ "bin/cli.js",
60
+ "tests/cli.test.js"
61
+ ],
62
+ "forbidden_files": [],
63
+ "max_deps_added": 0,
64
+ "spec_output_files": [
65
+ "bin/cli.js",
66
+ "tests/cli.test.js"
67
+ ]
68
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F1-cli-trivial-flag",
3
+ "category": "trivial",
4
+ "difficulty": "trivial",
5
+ "timeout_seconds": 900,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add a boolean --loud flag to bench-test-repo's hello subcommand. When passed, the greeting is uppercased and ends with '!!'. Default behavior unchanged. Update tests."
10
+ }
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env bash
2
+ # F1 setup — no changes to base test-repo needed.
3
+ set -e
4
+ exit 0