devlyn-cli 1.14.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +112 -119
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +129 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/_shared/archive_run.py +130 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -481
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/optional-skills/devlyn:reap/SKILL.md +105 -0
  117. package/optional-skills/devlyn:reap/scripts/reap.sh +129 -0
  118. package/optional-skills/devlyn:reap/scripts/scan.sh +116 -0
  119. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  120. package/package.json +16 -2
  121. package/scripts/lint-skills.sh +431 -0
  122. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -602
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -116
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -204
  125. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  126. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  127. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  128. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  129. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  130. package/config/skills/devlyn:clean/SKILL.md +0 -285
  131. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  132. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  133. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  134. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  135. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  136. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  137. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  138. package/config/skills/devlyn:preflight/SKILL.md +0 -370
  139. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  140. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -90
  141. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  142. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  143. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  144. package/config/skills/devlyn:review/SKILL.md +0 -161
  145. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  146. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  147. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  148. package/config/skills/workflow-routing/SKILL.md +0 -73
@@ -0,0 +1,162 @@
1
+ # Benchmark Judge Rubric
2
+
3
+ Stable across model upgrades. This file is the single source of truth for how
4
+ arms are scored and how ship gates evaluate a run. Do not change the rubric
5
+ during a benchmarking window — changing it invalidates comparability with
6
+ prior `history/runs/`.
7
+
8
+ **Outer goal lives in [`autoresearch/NORTH-STAR.md`](../../autoresearch/NORTH-STAR.md).** The release-decision layer (L0 / L1 / L2 contracts, wall-time efficiency, pair-cost justification) sits on top of the per-arm scoring rules below. When NORTH-STAR.md adds a release-gate number that this file did not have, the new number applies — open a doc-fix iter to mirror it here.
9
+
10
+ ## Scoring — 4 axes, 25 points each, 100 total
11
+
12
+ The blind judge scores both arms on identical axes without knowing which is
13
+ variant vs. bare.
14
+
15
+ ### Axis 1 — Spec Compliance (0-25)
16
+
17
+ Does this implementation satisfy every Requirements bullet in `spec.md`?
18
+ Does every Verification command behave as the spec states?
19
+
20
+ - **25** — All Requirements satisfied. All Verification commands would pass.
21
+ - **19-24** — 90%+ coverage, minor omissions.
22
+ - **13-18** — Partial implementation or verification gaps.
23
+ - **7-12** — Major requirements missed.
24
+ - **0-6** — Does not address the core task.
25
+
26
+ ### Axis 2 — Constraint Respect (0-25)
27
+
28
+ Zero new npm deps (unless spec allows), no silent catches (`try { } catch { return fallback }`), no `any`/`@ts-ignore`, explicit HOME/env guards where required, EACCES-specific handling, no hardcoded values that should be configurable.
29
+
30
+ - Each **disqualifier-severity forbidden_pattern match** or explicit constraint violation = −4 points, minimum 0.
31
+
32
+ ### Axis 3 — Scope Discipline (0-25)
33
+
34
+ Out of Scope respected. No gratuitous refactors of unrelated code. No "while I'm here" additions. No opportunistic upgrades.
35
+
36
+ - Each out-of-scope change = −5 points, minimum 0.
37
+
38
+ ### Axis 4 — Code Quality (0-25)
39
+
40
+ Readable, idiomatic for the language/framework, helpful error messages, appropriate abstraction level (not under- or over-engineered), uses standard library primitives where available (e.g., `fs.accessSync` over mode-bit checks per CLAUDE.md `phase-3-critic.md:32` calibration).
41
+
42
+ - This axis is judge-calibrated; no deterministic grading. The judge looks at
43
+ naming, function/file decomposition, error handling, and comparability to
44
+ idiomatic peer code.
45
+
46
+ ---
47
+
48
+ ## Judge Disqualifiers (hard floor)
49
+
50
+ Any of these produces `disqualifier: true` on the arm, overriding score:
51
+
52
+ - Silent-catch pattern in diff.
53
+ - Fabricated verification (code that claims to pass without actually running).
54
+ - Skipped a required test file that the spec names.
55
+ - Created a file listed in `expected.forbidden_files`.
56
+ - Exceeded `expected.max_deps_added`.
57
+ - `@ts-ignore` / `eslint-disable` without scoped justification comment.
58
+ - Hardcoded paths or values where spec required configurability.
59
+
60
+ Disqualified arms automatically lose the fixture regardless of score.
61
+
62
+ ---
63
+
64
+ ## Ship Gates
65
+
66
+ After the judge finishes every fixture, `scripts/ship-gate.py` applies these
67
+ rules to the run's `summary.json`.
68
+
69
+ ### Hard floors (any one failure blocks ship)
70
+
71
+ 1. **No disqualifier-level violation** in variant on any fixture.
72
+ 2. **F9 (E2E) must PASS** — novice-flow contract.
73
+ 3. **≥ 7 of 9 fixtures** must have margin ≥ +5 — **headroom-aware** (added 2026-05-02 per iter-0033 R4 + NORTH-STAR amendment): a fixture is excluded from this count when `100 - L0_score < 5` AND `L1_score >= 95` AND the L1 arm has no disqualifier / CRITICAL-HIGH finding / watchdog timeout / regression worse than gate #4. Excluded fixtures become fixture-rotation candidates per the policy below if the two-shipped-version rule is met.
74
+ 4. **No fixture regression worse than −5** vs. last `baselines/shipped.json` on the same fixture.
75
+
76
+ ### Soft gates (produce WARNING but do not block)
77
+
78
+ 5. Suite average margin drop > 3 vs. last shipped.
79
+ 6. A fixture that previously had margin > +5 now has margin ≤ 0.
80
+ 7. Critical-finding catch-rate decrease vs. last shipped variant (not vs. bare).
81
+
82
+ ### Known-limit exception
83
+
84
+ - **F8-known-limit-ambiguous** is excluded from gates 3 and 4. It exists to
85
+ document where the harness may not beat bare. Its allowed margin range is
86
+ [-3, +3]. Margins outside this range trigger a WARNING regardless of sign
87
+ (too-good means the fixture is no longer a known limit; too-bad means we
88
+ shipped a regression somewhere else that this fixture caught).
89
+
90
+ ---
91
+
92
+ ## Run Record
93
+
94
+ Every suite run appends an immutable record to `history/runs/<ts>-<label>.json`:
95
+
96
+ ```json
97
+ {
98
+ "run_id": "2026-04-23T12:00:00Z-v3.6",
99
+ "version_label": "v3.6",
100
+ "git_sha": "fdb7428...",
101
+ "branch": "benchmark/v3.6-ab-...",
102
+ "n_per_fixture": 1,
103
+ "judge_model": "<recorded from ~/.codex/config.toml at run time; do not hardcode>",
104
+ "judge_effort": "xhigh",
105
+ "fixtures": [
106
+ {
107
+ "id": "F2-cli-medium-subcommand",
108
+ "variant": { "score": 92, "wall_s": 707, "tokens_agg": 108852, "disqualifier": false,
109
+ "axes": {"spec": 23, "constraint": 23, "scope": 24, "quality": 22} },
110
+ "bare": { "score": 81, "wall_s": 101, "tokens_agg": 55588, "disqualifier": false,
111
+ "axes": {"spec": 19, "constraint": 19, "scope": 20, "quality": 23} },
112
+ "winner": "variant",
113
+ "margin": 11,
114
+ "critical_findings": {
115
+ "variant": [],
116
+ "bare": ["silent catch in findSkillMdFiles (no-silent-catches violation)"]
117
+ }
118
+ }
119
+ ],
120
+ "suite": {
121
+ "fixtures_run": 9,
122
+ "variant_avg": 89.3,
123
+ "bare_avg": 75.0,
124
+ "margin_avg": 14.3,
125
+ "hard_floor_violations": 0,
126
+ "ship_gate": "PASS"
127
+ }
128
+ }
129
+ ```
130
+
131
+ ---
132
+
133
+ ## Fixture Rotation Policy
134
+
135
+ If any fixture has both arms scoring > 95 for two consecutive shipped
136
+ versions, it's saturated and no longer differentiates. Replace with a harder
137
+ equivalent and record the swap in
138
+ `history/runs/<ts>-fixture-rotation.json`:
139
+
140
+ ```json
141
+ {
142
+ "retired": "F1-cli-trivial-flag",
143
+ "retired_reason": "both arms > 95 on v3.7 and v3.8 (saturation)",
144
+ "replacement": "F1b-cli-trivial-flag-v2",
145
+ "replacement_rationale": "adds exit-code precedence requirement that current leaders didn't handle on first try"
146
+ }
147
+ ```
148
+
149
+ Retired fixtures stay in `fixtures/retired/` for replay if a regression is
150
+ suspected in their area.
151
+
152
+ ---
153
+
154
+ ## Why These Thresholds
155
+
156
+ - **+5 margin floor** — below this, variant isn't reliably beating bare given
157
+ judge variance (empirically ~±3 per axis). Justifying the pipeline cost
158
+ requires a margin clearly above noise.
159
+ - **−5 regression floor** — a single-axis regression alone can look like −5; a
160
+ looser floor would let real regressions slip through.
161
+ - **7/9 fixtures rule** — tolerates one close-call + F8 known-limit; anything
162
+ worse means the suite is surfacing a broad harness problem.
@@ -0,0 +1,30 @@
1
+ # F1 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Trivial-tier calibration. Every arm should one-shot this; it's here to catch
6
+ catastrophic regressions and to anchor the "saturation" end of the scoring
7
+ scale.
8
+
9
+ ## Failure mode
10
+
11
+ - **Default-behavior regression.** Careless implementations add `--loud`
12
+ handling but accidentally alter the default case (e.g., always uppercasing
13
+ because the flag-check is misplaced). Verification commands 1 and 4 guard
14
+ against that.
15
+ - **Scope creep.** Modifying unrelated code while "here" would be caught by
16
+ both the CRITIC design sub-pass and the `git diff --stat` spec requirement.
17
+
18
+ ## Pipeline exercise
19
+
20
+ - Phase 0 routing: expected `standard` route (no risk keywords).
21
+ - Phase 1 BUILD: single-file edit.
22
+ - Phase 1.4 BUILD GATE: `node --check` + `node --test` both must pass.
23
+ - Phase 2 EVAL: minimal findings expected.
24
+ - Phase 3 CRITIC design: verifies diff surgical-ness.
25
+
26
+ ## Rotation trigger
27
+
28
+ When both arms score > 95 for two consecutive shipped versions, replace with
29
+ a harder trivial fixture (e.g., one that requires handling a new flag
30
+ interacting with existing flag precedence).
@@ -0,0 +1,68 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node bin/cli.js hello",
5
+ "exit_code": 0,
6
+ "stdout_contains": [
7
+ "Hello, world!"
8
+ ],
9
+ "stdout_not_contains": [
10
+ "HELLO"
11
+ ]
12
+ },
13
+ {
14
+ "cmd": "node bin/cli.js hello --loud",
15
+ "exit_code": 0,
16
+ "stdout_contains": [
17
+ "HELLO, WORLD!!"
18
+ ],
19
+ "stdout_not_contains": []
20
+ },
21
+ {
22
+ "cmd": "node bin/cli.js hello --loud --name alice",
23
+ "exit_code": 0,
24
+ "stdout_contains": [
25
+ "HELLO, ALICE!!"
26
+ ],
27
+ "stdout_not_contains": []
28
+ },
29
+ {
30
+ "cmd": "node bin/cli.js hello --name bob",
31
+ "exit_code": 0,
32
+ "stdout_contains": [
33
+ "Hello, bob!"
34
+ ],
35
+ "stdout_not_contains": [
36
+ "HELLO"
37
+ ]
38
+ },
39
+ {
40
+ "cmd": "node --test tests/cli.test.js",
41
+ "exit_code": 0,
42
+ "stdout_contains": [],
43
+ "stdout_not_contains": [
44
+ "not ok "
45
+ ]
46
+ }
47
+ ],
48
+ "forbidden_patterns": [
49
+ {
50
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|'')",
51
+ "description": "silent catch returning fallback",
52
+ "files": [
53
+ "bin/cli.js"
54
+ ],
55
+ "severity": "disqualifier"
56
+ }
57
+ ],
58
+ "required_files": [
59
+ "bin/cli.js",
60
+ "tests/cli.test.js"
61
+ ],
62
+ "forbidden_files": [],
63
+ "max_deps_added": 0,
64
+ "spec_output_files": [
65
+ "bin/cli.js",
66
+ "tests/cli.test.js"
67
+ ]
68
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F1-cli-trivial-flag",
3
+ "category": "trivial",
4
+ "difficulty": "trivial",
5
+ "timeout_seconds": 900,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add a boolean --loud flag to bench-test-repo's hello subcommand. When passed, the greeting is uppercased and ends with '!!'. Default behavior unchanged. Update tests."
10
+ }
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env bash
2
+ # F1 setup — no changes to base test-repo needed.
3
+ set -e
4
+ exit 0
@@ -0,0 +1,45 @@
1
+ ---
2
+ id: "F1-cli-trivial-flag"
3
+ title: "Add --loud flag to hello subcommand"
4
+ status: planned
5
+ complexity: trivial
6
+ depends-on: []
7
+ ---
8
+
9
+ # F1 Add `--loud` flag to `hello`
10
+
11
+ ## Context
12
+
13
+ The `hello` subcommand in `bin/cli.js` currently prints `Hello, <name>!`. A
14
+ `--loud` flag gives users an emphatic variant without breaking the default.
15
+ This is a low-risk edit used to calibrate trivial-tier fixture difficulty.
16
+
17
+ ## Requirements
18
+
19
+ - [ ] `node bin/cli.js hello --loud` prints `HELLO, WORLD!!` (everything uppercased, two trailing exclamation marks).
20
+ - [ ] `node bin/cli.js hello --loud --name alice` prints `HELLO, ALICE!!`.
21
+ - [ ] `node bin/cli.js hello` (no flag) still prints `Hello, world!` (unchanged).
22
+ - [ ] `node bin/cli.js hello --name bob` still prints `Hello, bob!` (unchanged).
23
+ - [ ] Existing tests continue to pass. Add at least one test covering the `--loud` path.
24
+
25
+ ## Constraints
26
+
27
+ - **No new npm dependencies.** Built-ins only.
28
+ - **No silent catches.** If an unknown flag is passed, exit 1 with an informative message (same pattern as the existing `--name` handler).
29
+ - **Surgical diff.** Only touch `bin/cli.js` and `tests/cli.test.js`. Do not reformat unrelated code.
30
+
31
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
32
+
33
+ ## Out of Scope
34
+
35
+ - Adding unrelated flags (`--quiet`, `--locale`, etc.).
36
+ - Refactoring the existing argument parser.
37
+ - Touching `server/`, `web/`, or `tests/server.test.js`.
38
+
39
+ ## Verification
40
+
41
+ - `node bin/cli.js hello` prints `Hello, world!` (exit 0).
42
+ - `node bin/cli.js hello --loud` prints `HELLO, WORLD!!` (exit 0).
43
+ - `node bin/cli.js hello --loud --name alice` prints `HELLO, ALICE!!` (exit 0).
44
+ - `node --test tests/` passes all tests including the new `--loud` case.
45
+ - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched.
@@ -0,0 +1,8 @@
1
+ Add a --loud flag to the `hello` subcommand in bench-test-repo's CLI (bin/cli.js). When --loud is passed, the greeting is uppercased and ends with two exclamation marks.
2
+
3
+ For example:
4
+ - `node bin/cli.js hello --loud` → `HELLO, WORLD!!`
5
+ - `node bin/cli.js hello --loud --name alice` → `HELLO, ALICE!!`
6
+ - `node bin/cli.js hello` → `Hello, world!` (unchanged default)
7
+
8
+ Make sure existing tests still pass and add at least one test for the --loud path. Don't touch unrelated files — only `bin/cli.js` and `tests/cli.test.js`. No new npm dependencies.
@@ -0,0 +1,54 @@
1
+ # F2 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Canonical **medium-complexity single-file CLI task** in the suite. Tests the
6
+ middle-ground: a task big enough that first-draft implementations often miss
7
+ an edge case (EACCES vs missing-dir distinction, TTY gating, HOME guard),
8
+ small enough that every arm can plausibly finish in < 10 minutes.
9
+
10
+ ## What failure mode does it detect?
11
+
12
+ - **Silent catches.** The pattern `try { readdirSync(...) } catch { return [] }`
13
+ is a natural shortcut here. Bare prompt arms tend to take it. The pipeline's
14
+ EVAL phase catches it as a `correctness.silent-error` or
15
+ `hygiene.silent-catch` finding.
16
+ - **Edge-case distinction.** ENOENT vs EACCES must be reported differently.
17
+ Arms that collapse both into a generic FAIL miss a spec Requirement.
18
+ - **Over-engineering.** Since v3.6's CRITIC calibration, hand-rolled
19
+ mode-bit writable checks are blocked in favor of `fs.accessSync(...,
20
+ fs.constants.W_OK)`.
21
+
22
+ ## Which pipeline phases does it exercise?
23
+
24
+ - Phase 0: routing — `permission`, `env` risk keywords in the task body
25
+ escalate to `strict`.
26
+ - Phase 1 BUILD: main implementation pass.
27
+ - Phase 1.4 BUILD GATE: `node --check` syntax gate.
28
+ - Phase 2 EVAL: catches silent-catch trap if present.
29
+ - Phase 3 CRITIC design: applies stdlib-vs-hand-rolled calibration.
30
+ - Phase 3 CRITIC security (native): minimal — no deps changed.
31
+ - Phase 4 DOCS: spec frontmatter `status: done`.
32
+
33
+ ## Why can't another fixture cover this?
34
+
35
+ - F1 is trivial (single-line edit, no edge cases).
36
+ - F3 is backend (different idioms, tests run differently).
37
+ - F5 is designed to force fix-loop (not applicable here).
38
+ - F7 is scope-creep (orthogonal concern).
39
+
40
+ ## When should this fixture be retired or replaced?
41
+
42
+ When both arms score > 95 for two consecutive shipped versions — i.e., the
43
+ fixture saturates and no longer differentiates. Candidate replacement: a
44
+ similar-size CLI task with multiple interacting flags or a subcommand that
45
+ spawns a child process.
46
+
47
+ ## Calibration history
48
+
49
+ - v3.4 skill 57 / bare 45 / margin +12 (gpt-5.3-codex judge)
50
+ - v3.4.1 skill 59 / bare 43 / margin +16 (gpt-5.3-codex judge)
51
+ - v3.5 skill 92 / bare 81 / margin +11 (gpt-5.4 xhigh judge) — huge absolute jump; bare silent-catch caught
52
+
53
+ Absolute scores jumped with the stronger judge. Margin stays solid (+11);
54
+ after stdlib calibration it is expected to open a few points more.
@@ -0,0 +1,170 @@
1
+ {
2
+ "fixture_id": "F2-cli-medium-subcommand",
3
+ "generated_at": "2026-04-29T09:57:53Z",
4
+ "generated_from": {
5
+ "expected_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json",
6
+ "expected_sha256": "ddef8feba49f20b6957e37840bc6a03e78e554776e380d81ad6390944c72fcab",
7
+ "metadata_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json",
8
+ "metadata_sha256": "1b8066a7c649eb6baad7a3e056edbdb16cc3b796e154cedee0cf2258c5543b18",
9
+ "oracle_script_shas": {
10
+ "scope-tier-a": "baaf21ed4a67f35d2a8af825e72869ef9737b5dfe08d65dd1a11c26fafe297ae",
11
+ "scope-tier-b": "9349d00a5c7456a4df9142923334e7004407d53f2443f2e210945bb771971e25",
12
+ "test-fidelity": "401184da51ae500cecfc75a6c5819b0d28acb63a397f788fb628c2913562f903"
13
+ }
14
+ },
15
+ "required_invariants": [
16
+ {
17
+ "authority": "expected.json/forbidden_patterns",
18
+ "id": "forbidden_pattern__silent_catch_returning_a_fallback_value_violates_no_silent_c__bin_cli_js",
19
+ "operational_check": "variant arm output MUST NOT contain regex pattern \"catch\\\\s*\\\\([^)]*\\\\)\\\\s*\\\\{[^}]*return\\\\s+(\\\\[\\\\]|null|undefined|\\\\{|false|'')\" in files ['bin/cli.js']; rationale: silent catch returning a fallback value — violates no-silent-catches policy",
20
+ "severity": "disqualifier",
21
+ "source_field": "expected.json/forbidden_patterns/0",
22
+ "source_ref": "expected.json:forbidden_patterns[0]"
23
+ },
24
+ {
25
+ "authority": "expected.json/forbidden_patterns",
26
+ "id": "forbidden_pattern__ts_ignore_escape_hatch__bin_cli_js",
27
+ "operational_check": "variant arm output MUST NOT contain regex pattern '@ts-ignore' in files ['bin/cli.js']; rationale: @ts-ignore escape hatch",
28
+ "severity": "disqualifier",
29
+ "source_field": "expected.json/forbidden_patterns/1",
30
+ "source_ref": "expected.json:forbidden_patterns[1]"
31
+ },
32
+ {
33
+ "authority": "expected.json/max_deps_added",
34
+ "id": "max_deps_added__0",
35
+ "operational_check": "variant arm MUST NOT add more than 0 new npm dependencies (count delta of package.json:dependencies + devDependencies)",
36
+ "severity": "hard",
37
+ "source_field": "expected.json/max_deps_added",
38
+ "source_ref": "expected.json:max_deps_added"
39
+ },
40
+ {
41
+ "authority": "expected.json/required_files",
42
+ "id": "required_file__bin_cli_js",
43
+ "operational_check": "variant arm output MUST contain file 'bin/cli.js' (created or preserved)",
44
+ "severity": "hard",
45
+ "source_field": "expected.json/required_files",
46
+ "source_ref": "expected.json:required_files[bin/cli.js]"
47
+ },
48
+ {
49
+ "authority": "metadata/oracle-allowlist",
50
+ "id": "scope-tier-a:lockfile-deletion",
51
+ "operational_check": "variant arm MUST NOT delete a scaffold-present lockfile",
52
+ "severity": "hard",
53
+ "source_field": "oracle/scope-tier-a/scope-tier-a:lockfile-deletion",
54
+ "source_ref": "oracle-scope-tier-a.py"
55
+ },
56
+ {
57
+ "authority": "metadata/oracle-allowlist",
58
+ "id": "scope-tier-a:tier-a-violation",
59
+ "operational_check": "variant arm MUST NOT add or modify paths matching: docs/roadmap/** | docs/VISION.md | docs/ROADMAP.md | .github/** | node_modules/** | **/node_modules/** | test-results/** | coverage/** | .nyc_output/** | basename suffix .log | basename prefix .env or secrets.",
60
+ "severity": "hard",
61
+ "source_field": "oracle/scope-tier-a/scope-tier-a:tier-a-violation",
62
+ "source_ref": "oracle-scope-tier-a.py"
63
+ },
64
+ {
65
+ "authority": "metadata/oracle-allowlist",
66
+ "id": "scope-tier-b:scope-unmatched",
67
+ "operational_check": "every variant-touched file MUST be either inside spec_output_files (Tier C) OR reachable from a Tier C seed via static JS/TS imports OR matched by expected.json:tier_a_waivers",
68
+ "severity": "warn",
69
+ "source_field": "oracle/scope-tier-b/scope-tier-b:scope-unmatched",
70
+ "source_ref": "oracle-scope-tier-b.py"
71
+ },
72
+ {
73
+ "authority": "expected.json/spec_output_files",
74
+ "id": "spec_output_file__bin_cli_js",
75
+ "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'bin/cli.js' is one Tier C seed",
76
+ "severity": "warn",
77
+ "source_field": "expected.json/spec_output_files",
78
+ "source_ref": "expected.json:spec_output_files[bin/cli.js]"
79
+ },
80
+ {
81
+ "authority": "expected.json/spec_output_files",
82
+ "id": "spec_output_file__tests_cli_test_js",
83
+ "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'tests/cli.test.js' is one Tier C seed",
84
+ "severity": "warn",
85
+ "source_field": "expected.json/spec_output_files",
86
+ "source_ref": "expected.json:spec_output_files[tests/cli.test.js]"
87
+ },
88
+ {
89
+ "authority": "metadata/oracle-allowlist",
90
+ "id": "test-fidelity:assertion-regression",
91
+ "operational_check": "effective assertion count MUST NOT drop and skipped-test count MUST NOT rise; vacuous expect.assertions(0) is treated as a real regression",
92
+ "severity": "warn",
93
+ "source_field": "oracle/test-fidelity/test-fidelity:assertion-regression",
94
+ "source_ref": "oracle-test-fidelity.py"
95
+ },
96
+ {
97
+ "authority": "metadata/oracle-allowlist",
98
+ "id": "test-fidelity:mock-swap",
99
+ "operational_check": "post-arm test file MUST NOT swap REAL_PATTERNS hits for MOCK_PATTERNS hits (jest/vi/sinon, nock/msw, app.handle/inject/callback, hand-rolled IncomingMessage/ServerResponse, etc.); a drop in real_calls combined with a rise in mock_calls is a mock-swap flag",
100
+ "severity": "flag",
101
+ "source_field": "oracle/test-fidelity/test-fidelity:mock-swap",
102
+ "source_ref": "oracle-test-fidelity.py"
103
+ },
104
+ {
105
+ "authority": "metadata/oracle-allowlist",
106
+ "id": "test-fidelity:test-file-deleted",
107
+ "operational_check": "no scaffold-present test file may be deleted by the variant arm; deletion of an existing tests/*.test.* / *.spec.* / *.e2e.* file is a flag-severity finding",
108
+ "severity": "flag",
109
+ "source_field": "oracle/test-fidelity/test-fidelity:test-file-deleted",
110
+ "source_ref": "oracle-test-fidelity.py"
111
+ },
112
+ {
113
+ "authority": "metadata/oracle-allowlist",
114
+ "id": "test-fidelity:test-file-renamed",
115
+ "operational_check": "rename of a scaffold-present test file is warn-severity (content fidelity not verified across renames in step 1)",
116
+ "severity": "warn",
117
+ "source_field": "oracle/test-fidelity/test-fidelity:test-file-renamed",
118
+ "source_ref": "oracle-test-fidelity.py"
119
+ },
120
+ {
121
+ "authority": "expected.json/verification_commands",
122
+ "id": "verification__3f35982a",
123
+ "operational_check": "running `node bin/cli.js doctor` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor:']; stdout MUST NOT contain any of ['undefined', 'Error:']",
124
+ "severity": "hard",
125
+ "source_field": "expected.json/verification_commands/0",
126
+ "source_ref": "expected.json:verification_commands[0]"
127
+ },
128
+ {
129
+ "authority": "expected.json/verification_commands",
130
+ "id": "verification__460fce04",
131
+ "operational_check": "running `HOME=/nonexistent node bin/cli.js doctor` in the post-arm work dir MUST exit with code 1; stdout MUST contain all of ['/nonexistent']; stdout MUST NOT contain any of []",
132
+ "severity": "hard",
133
+ "source_field": "expected.json/verification_commands/1",
134
+ "source_ref": "expected.json:verification_commands[1]"
135
+ },
136
+ {
137
+ "authority": "expected.json/verification_commands",
138
+ "id": "verification__973e287e",
139
+ "operational_check": "running `python3 -c \"import subprocess; r = subprocess.run(['node', 'bin/cli.js', 'doctor'], capture_output=True); n = r.stdout.count(b'\\x1b['); print(n); exit(0 if n == 0 else 1)\"` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['0']; stdout MUST NOT contain any of []",
140
+ "severity": "hard",
141
+ "source_field": "expected.json/verification_commands/2",
142
+ "source_ref": "expected.json:verification_commands[2]"
143
+ },
144
+ {
145
+ "authority": "expected.json/verification_commands",
146
+ "id": "verification__d6253a97",
147
+ "operational_check": "running `node bin/cli.js doctor --help` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor']; stdout MUST NOT contain any of []",
148
+ "severity": "hard",
149
+ "source_field": "expected.json/verification_commands/3",
150
+ "source_ref": "expected.json:verification_commands[3]"
151
+ },
152
+ {
153
+ "authority": "expected.json/verification_commands",
154
+ "id": "verification__e0f149e4",
155
+ "operational_check": "running `node bin/cli.js --help` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor']; stdout MUST NOT contain any of []",
156
+ "severity": "hard",
157
+ "source_field": "expected.json/verification_commands/4",
158
+ "source_ref": "expected.json:verification_commands[4]"
159
+ },
160
+ {
161
+ "authority": "expected.json/verification_commands",
162
+ "id": "verification__fdbcd321",
163
+ "operational_check": "running `node bin/cli.js doctor --verbose` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor:']; stdout MUST NOT contain any of ['Error:']",
164
+ "severity": "hard",
165
+ "source_field": "expected.json/verification_commands/5",
166
+ "source_ref": "expected.json:verification_commands[5]"
167
+ }
168
+ ],
169
+ "schema_version": "1"
170
+ }
@@ -0,0 +1,84 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node bin/cli.js doctor",
5
+ "exit_code": 0,
6
+ "stdout_contains": [
7
+ "doctor:"
8
+ ],
9
+ "stdout_not_contains": [
10
+ "undefined",
11
+ "Error:"
12
+ ]
13
+ },
14
+ {
15
+ "cmd": "HOME=/nonexistent node bin/cli.js doctor",
16
+ "exit_code": 1,
17
+ "stdout_contains": [
18
+ "/nonexistent"
19
+ ],
20
+ "stdout_not_contains": []
21
+ },
22
+ {
23
+ "cmd": "python3 -c \"import subprocess; r = subprocess.run(['node', 'bin/cli.js', 'doctor'], capture_output=True); n = r.stdout.count(b'\\x1b['); print(n); exit(0 if n == 0 else 1)\"",
24
+ "exit_code": 0,
25
+ "stdout_contains": [
26
+ "0"
27
+ ],
28
+ "stdout_not_contains": []
29
+ },
30
+ {
31
+ "cmd": "node bin/cli.js doctor --help",
32
+ "exit_code": 0,
33
+ "stdout_contains": [
34
+ "doctor"
35
+ ],
36
+ "stdout_not_contains": []
37
+ },
38
+ {
39
+ "cmd": "node bin/cli.js --help",
40
+ "exit_code": 0,
41
+ "stdout_contains": [
42
+ "doctor"
43
+ ],
44
+ "stdout_not_contains": []
45
+ },
46
+ {
47
+ "cmd": "node bin/cli.js doctor --verbose",
48
+ "exit_code": 0,
49
+ "stdout_contains": [
50
+ "doctor:"
51
+ ],
52
+ "stdout_not_contains": [
53
+ "Error:"
54
+ ]
55
+ }
56
+ ],
57
+ "forbidden_patterns": [
58
+ {
59
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(?:\\[\\]|null|undefined|false|''|\\{\\s*\\})",
60
+ "description": "silent catch returning a fallback value (null / undefined / [] / false / '' / empty {}) \u2014 violates no-silent-catches policy. Structured error returns like `return { level: 'fail', message }` are NOT silent (they surface a user-visible failure object) and must not match.",
61
+ "files": [
62
+ "bin/cli.js"
63
+ ],
64
+ "severity": "disqualifier"
65
+ },
66
+ {
67
+ "pattern": "@ts-ignore",
68
+ "description": "@ts-ignore escape hatch",
69
+ "files": [
70
+ "bin/cli.js"
71
+ ],
72
+ "severity": "disqualifier"
73
+ }
74
+ ],
75
+ "required_files": [
76
+ "bin/cli.js"
77
+ ],
78
+ "forbidden_files": [],
79
+ "max_deps_added": 0,
80
+ "spec_output_files": [
81
+ "bin/cli.js",
82
+ "tests/cli.test.js"
83
+ ]
84
+ }
@@ -0,0 +1,21 @@
1
+ {
2
+ "id": "F2-cli-medium-subcommand",
3
+ "category": "medium",
4
+ "difficulty": "medium",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": [
7
+ "node"
8
+ ],
9
+ "browser": false,
10
+ "deps_change_expected": false,
11
+ "intent": "Add a `doctor` subcommand to bin/cli.js that diagnoses the local environment: node version check, $HOME/.claude directory check, installed plugins count, installed skills count, TTY-gated ANSI color, summary line, exit code, --verbose flag, help integration. Zero new npm dependencies. No silent error catches.",
12
+ "pair_plan_oracle_categories": [
13
+ "scope-tier-a:lockfile-deletion",
14
+ "scope-tier-a:tier-a-violation",
15
+ "scope-tier-b:scope-unmatched",
16
+ "test-fidelity:assertion-regression",
17
+ "test-fidelity:mock-swap",
18
+ "test-fidelity:test-file-deleted",
19
+ "test-fidelity:test-file-renamed"
20
+ ]
21
+ }