devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,45 @@
1
+ ---
2
+ id: "F1-cli-trivial-flag"
3
+ title: "Add --loud flag to hello subcommand"
4
+ status: planned
5
+ complexity: trivial
6
+ depends-on: []
7
+ ---
8
+
9
+ # F1 Add `--loud` flag to `hello`
10
+
11
+ ## Context
12
+
13
+ The `hello` subcommand in `bin/cli.js` currently prints `Hello, <name>!`. A
14
+ `--loud` flag gives users an emphatic variant without breaking the default.
15
+ This is a low-risk edit used to calibrate trivial-tier fixture difficulty.
16
+
17
+ ## Requirements
18
+
19
+ - [ ] `node bin/cli.js hello --loud` prints `HELLO, WORLD!!` (everything uppercased, two trailing exclamation marks).
20
+ - [ ] `node bin/cli.js hello --loud --name alice` prints `HELLO, ALICE!!`.
21
+ - [ ] `node bin/cli.js hello` (no flag) still prints `Hello, world!` (unchanged).
22
+ - [ ] `node bin/cli.js hello --name bob` still prints `Hello, bob!` (unchanged).
23
+ - [ ] Existing tests continue to pass. Add at least one test covering the `--loud` path.
24
+
25
+ ## Constraints
26
+
27
+ - **No new npm dependencies.** Built-ins only.
28
+ - **No silent catches.** If an unknown flag is passed, exit 1 with an informative message (same pattern as the existing `--name` handler).
29
+ - **Surgical diff.** Only touch `bin/cli.js` and `tests/cli.test.js`. Do not reformat unrelated code.
30
+
31
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
32
+
33
+ ## Out of Scope
34
+
35
+ - Adding unrelated flags (`--quiet`, `--locale`, etc.).
36
+ - Refactoring the existing argument parser.
37
+ - Touching `server/`, `web/`, or `tests/server.test.js`.
38
+
39
+ ## Verification
40
+
41
+ - `node bin/cli.js hello` prints `Hello, world!` (exit 0).
42
+ - `node bin/cli.js hello --loud` prints `HELLO, WORLD!!` (exit 0).
43
+ - `node bin/cli.js hello --loud --name alice` prints `HELLO, ALICE!!` (exit 0).
44
+ - `node --test tests/` passes all tests including the new `--loud` case.
45
+ - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched.
@@ -0,0 +1,8 @@
1
+ Add a --loud flag to the `hello` subcommand in bench-test-repo's CLI (bin/cli.js). When --loud is passed, the greeting is uppercased and ends with two exclamation marks.
2
+
3
+ For example:
4
+ - `node bin/cli.js hello --loud` → `HELLO, WORLD!!`
5
+ - `node bin/cli.js hello --loud --name alice` → `HELLO, ALICE!!`
6
+ - `node bin/cli.js hello` → `Hello, world!` (unchanged default)
7
+
8
+ Make sure existing tests still pass and add at least one test for the --loud path. Don't touch unrelated files — only `bin/cli.js` and `tests/cli.test.js`. No new npm dependencies.
@@ -0,0 +1,54 @@
1
+ # F2 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Canonical **medium-complexity single-file CLI task** in the suite. Tests the
6
+ middle-ground: a task big enough that first-draft implementations often miss
7
+ an edge case (EACCES vs missing-dir distinction, TTY gating, HOME guard),
8
+ small enough that every arm can plausibly finish in < 10 minutes.
9
+
10
+ ## What failure mode does it detect?
11
+
12
+ - **Silent catches.** The pattern `try { readdirSync(...) } catch { return [] }`
13
+ is a natural shortcut here. Bare prompt arms tend to take it. The pipeline's
14
+ EVAL phase catches it as a `correctness.silent-error` or
15
+ `hygiene.silent-catch` finding.
16
+ - **Edge-case distinction.** ENOENT vs EACCES must be reported differently.
17
+ Arms that collapse both into a generic FAIL miss a spec Requirement.
18
+ - **Over-engineering.** Since v3.6's CRITIC calibration, hand-rolled
19
+ mode-bit writable checks are blocked in favor of `fs.accessSync(...,
20
+ fs.constants.W_OK)`.
21
+
22
+ ## Which pipeline phases does it exercise?
23
+
24
+ - Phase 0: routing — `permission`, `env` risk keywords in the task body
25
+ escalate to `strict`.
26
+ - Phase 1 BUILD: main implementation pass.
27
+ - Phase 1.4 BUILD GATE: `node --check` syntax gate.
28
+ - Phase 2 EVAL: catches silent-catch trap if present.
29
+ - Phase 3 CRITIC design: applies stdlib-vs-hand-rolled calibration.
30
+ - Phase 3 CRITIC security (native): minimal — no deps changed.
31
+ - Phase 4 DOCS: spec frontmatter `status: done`.
32
+
33
+ ## Why can't another fixture cover this?
34
+
35
+ - F1 is trivial (single-line edit, no edge cases).
36
+ - F3 is backend (different idioms, tests run differently).
37
+ - F5 is designed to force fix-loop (not applicable here).
38
+ - F7 is scope-creep (orthogonal concern).
39
+
40
+ ## When should this fixture be retired or replaced?
41
+
42
+ When both arms score > 95 for two consecutive shipped versions — i.e., the
43
+ fixture saturates and no longer differentiates. Candidate replacement: a
44
+ similar-size CLI task with multiple interacting flags or a subcommand that
45
+ spawns a child process.
46
+
47
+ ## Calibration history
48
+
49
+ - v3.4 skill 57 / bare 45 / margin +12 (gpt-5.3-codex judge)
50
+ - v3.4.1 skill 59 / bare 43 / margin +16 (gpt-5.3-codex judge)
51
+ - v3.5 skill 92 / bare 81 / margin +11 (gpt-5.4 xhigh judge) — huge absolute jump; bare silent-catch caught
52
+
53
+ Absolute scores jumped with the stronger judge. The margin stays solid at +11
54
+ and is expected to open a few points more once stdlib calibration lands.
@@ -0,0 +1,170 @@
1
+ {
2
+ "fixture_id": "F2-cli-medium-subcommand",
3
+ "generated_at": "2026-04-29T09:57:53Z",
4
+ "generated_from": {
5
+ "expected_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json",
6
+ "expected_sha256": "ddef8feba49f20b6957e37840bc6a03e78e554776e380d81ad6390944c72fcab",
7
+ "metadata_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json",
8
+ "metadata_sha256": "1b8066a7c649eb6baad7a3e056edbdb16cc3b796e154cedee0cf2258c5543b18",
9
+ "oracle_script_shas": {
10
+ "scope-tier-a": "baaf21ed4a67f35d2a8af825e72869ef9737b5dfe08d65dd1a11c26fafe297ae",
11
+ "scope-tier-b": "9349d00a5c7456a4df9142923334e7004407d53f2443f2e210945bb771971e25",
12
+ "test-fidelity": "401184da51ae500cecfc75a6c5819b0d28acb63a397f788fb628c2913562f903"
13
+ }
14
+ },
15
+ "required_invariants": [
16
+ {
17
+ "authority": "expected.json/forbidden_patterns",
18
+ "id": "forbidden_pattern__silent_catch_returning_a_fallback_value_violates_no_silent_c__bin_cli_js",
19
+ "operational_check": "variant arm output MUST NOT contain regex pattern \"catch\\\\s*\\\\([^)]*\\\\)\\\\s*\\\\{[^}]*return\\\\s+(\\\\[\\\\]|null|undefined|\\\\{|false|'')\" in files ['bin/cli.js']; rationale: silent catch returning a fallback value — violates no-silent-catches policy",
20
+ "severity": "disqualifier",
21
+ "source_field": "expected.json/forbidden_patterns/0",
22
+ "source_ref": "expected.json:forbidden_patterns[0]"
23
+ },
24
+ {
25
+ "authority": "expected.json/forbidden_patterns",
26
+ "id": "forbidden_pattern__ts_ignore_escape_hatch__bin_cli_js",
27
+ "operational_check": "variant arm output MUST NOT contain regex pattern '@ts-ignore' in files ['bin/cli.js']; rationale: @ts-ignore escape hatch",
28
+ "severity": "disqualifier",
29
+ "source_field": "expected.json/forbidden_patterns/1",
30
+ "source_ref": "expected.json:forbidden_patterns[1]"
31
+ },
32
+ {
33
+ "authority": "expected.json/max_deps_added",
34
+ "id": "max_deps_added__0",
35
+ "operational_check": "variant arm MUST NOT add more than 0 new npm dependencies (count delta of package.json:dependencies + devDependencies)",
36
+ "severity": "hard",
37
+ "source_field": "expected.json/max_deps_added",
38
+ "source_ref": "expected.json:max_deps_added"
39
+ },
40
+ {
41
+ "authority": "expected.json/required_files",
42
+ "id": "required_file__bin_cli_js",
43
+ "operational_check": "variant arm output MUST contain file 'bin/cli.js' (created or preserved)",
44
+ "severity": "hard",
45
+ "source_field": "expected.json/required_files",
46
+ "source_ref": "expected.json:required_files[bin/cli.js]"
47
+ },
48
+ {
49
+ "authority": "metadata/oracle-allowlist",
50
+ "id": "scope-tier-a:lockfile-deletion",
51
+ "operational_check": "variant arm MUST NOT delete a scaffold-present lockfile",
52
+ "severity": "hard",
53
+ "source_field": "oracle/scope-tier-a/scope-tier-a:lockfile-deletion",
54
+ "source_ref": "oracle-scope-tier-a.py"
55
+ },
56
+ {
57
+ "authority": "metadata/oracle-allowlist",
58
+ "id": "scope-tier-a:tier-a-violation",
59
+ "operational_check": "variant arm MUST NOT add or modify paths matching: docs/roadmap/** | docs/VISION.md | docs/ROADMAP.md | .github/** | node_modules/** | **/node_modules/** | test-results/** | coverage/** | .nyc_output/** | basename suffix .log | basename prefix .env or secrets.",
60
+ "severity": "hard",
61
+ "source_field": "oracle/scope-tier-a/scope-tier-a:tier-a-violation",
62
+ "source_ref": "oracle-scope-tier-a.py"
63
+ },
64
+ {
65
+ "authority": "metadata/oracle-allowlist",
66
+ "id": "scope-tier-b:scope-unmatched",
67
+ "operational_check": "every variant-touched file MUST be either inside spec_output_files (Tier C) OR reachable from a Tier C seed via static JS/TS imports OR matched by expected.json:tier_a_waivers",
68
+ "severity": "warn",
69
+ "source_field": "oracle/scope-tier-b/scope-tier-b:scope-unmatched",
70
+ "source_ref": "oracle-scope-tier-b.py"
71
+ },
72
+ {
73
+ "authority": "expected.json/spec_output_files",
74
+ "id": "spec_output_file__bin_cli_js",
75
+ "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'bin/cli.js' is one Tier C seed",
76
+ "severity": "warn",
77
+ "source_field": "expected.json/spec_output_files",
78
+ "source_ref": "expected.json:spec_output_files[bin/cli.js]"
79
+ },
80
+ {
81
+ "authority": "expected.json/spec_output_files",
82
+ "id": "spec_output_file__tests_cli_test_js",
83
+ "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'tests/cli.test.js' is one Tier C seed",
84
+ "severity": "warn",
85
+ "source_field": "expected.json/spec_output_files",
86
+ "source_ref": "expected.json:spec_output_files[tests/cli.test.js]"
87
+ },
88
+ {
89
+ "authority": "metadata/oracle-allowlist",
90
+ "id": "test-fidelity:assertion-regression",
91
+ "operational_check": "effective assertion count MUST NOT drop and skipped-test count MUST NOT rise; vacuous expect.assertions(0) is treated as a real regression",
92
+ "severity": "warn",
93
+ "source_field": "oracle/test-fidelity/test-fidelity:assertion-regression",
94
+ "source_ref": "oracle-test-fidelity.py"
95
+ },
96
+ {
97
+ "authority": "metadata/oracle-allowlist",
98
+ "id": "test-fidelity:mock-swap",
99
+ "operational_check": "post-arm test file MUST NOT swap REAL_PATTERNS hits for MOCK_PATTERNS hits (jest/vi/sinon, nock/msw, app.handle/inject/callback, hand-rolled IncomingMessage/ServerResponse, etc.); a drop in real_calls combined with a rise in mock_calls is a mock-swap flag",
100
+ "severity": "flag",
101
+ "source_field": "oracle/test-fidelity/test-fidelity:mock-swap",
102
+ "source_ref": "oracle-test-fidelity.py"
103
+ },
104
+ {
105
+ "authority": "metadata/oracle-allowlist",
106
+ "id": "test-fidelity:test-file-deleted",
107
+ "operational_check": "no scaffold-present test file may be deleted by the variant arm; deletion of an existing tests/*.test.* / *.spec.* / *.e2e.* file is a flag-severity finding",
108
+ "severity": "flag",
109
+ "source_field": "oracle/test-fidelity/test-fidelity:test-file-deleted",
110
+ "source_ref": "oracle-test-fidelity.py"
111
+ },
112
+ {
113
+ "authority": "metadata/oracle-allowlist",
114
+ "id": "test-fidelity:test-file-renamed",
115
+ "operational_check": "rename of a scaffold-present test file is warn-severity (content fidelity not verified across renames in step 1)",
116
+ "severity": "warn",
117
+ "source_field": "oracle/test-fidelity/test-fidelity:test-file-renamed",
118
+ "source_ref": "oracle-test-fidelity.py"
119
+ },
120
+ {
121
+ "authority": "expected.json/verification_commands",
122
+ "id": "verification__3f35982a",
123
+ "operational_check": "running `node bin/cli.js doctor` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor:']; stdout MUST NOT contain any of ['undefined', 'Error:']",
124
+ "severity": "hard",
125
+ "source_field": "expected.json/verification_commands/0",
126
+ "source_ref": "expected.json:verification_commands[0]"
127
+ },
128
+ {
129
+ "authority": "expected.json/verification_commands",
130
+ "id": "verification__460fce04",
131
+ "operational_check": "running `HOME=/nonexistent node bin/cli.js doctor` in the post-arm work dir MUST exit with code 1; stdout MUST contain all of ['/nonexistent']; stdout MUST NOT contain any of []",
132
+ "severity": "hard",
133
+ "source_field": "expected.json/verification_commands/1",
134
+ "source_ref": "expected.json:verification_commands[1]"
135
+ },
136
+ {
137
+ "authority": "expected.json/verification_commands",
138
+ "id": "verification__973e287e",
139
+ "operational_check": "running `python3 -c \"import subprocess; r = subprocess.run(['node', 'bin/cli.js', 'doctor'], capture_output=True); n = r.stdout.count(b'\\x1b['); print(n); exit(0 if n == 0 else 1)\"` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['0']; stdout MUST NOT contain any of []",
140
+ "severity": "hard",
141
+ "source_field": "expected.json/verification_commands/2",
142
+ "source_ref": "expected.json:verification_commands[2]"
143
+ },
144
+ {
145
+ "authority": "expected.json/verification_commands",
146
+ "id": "verification__d6253a97",
147
+ "operational_check": "running `node bin/cli.js doctor --help` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor']; stdout MUST NOT contain any of []",
148
+ "severity": "hard",
149
+ "source_field": "expected.json/verification_commands/3",
150
+ "source_ref": "expected.json:verification_commands[3]"
151
+ },
152
+ {
153
+ "authority": "expected.json/verification_commands",
154
+ "id": "verification__e0f149e4",
155
+ "operational_check": "running `node bin/cli.js --help` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor']; stdout MUST NOT contain any of []",
156
+ "severity": "hard",
157
+ "source_field": "expected.json/verification_commands/4",
158
+ "source_ref": "expected.json:verification_commands[4]"
159
+ },
160
+ {
161
+ "authority": "expected.json/verification_commands",
162
+ "id": "verification__fdbcd321",
163
+ "operational_check": "running `node bin/cli.js doctor --verbose` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor:']; stdout MUST NOT contain any of ['Error:']",
164
+ "severity": "hard",
165
+ "source_field": "expected.json/verification_commands/5",
166
+ "source_ref": "expected.json:verification_commands[5]"
167
+ }
168
+ ],
169
+ "schema_version": "1"
170
+ }
@@ -0,0 +1,84 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node bin/cli.js doctor",
5
+ "exit_code": 0,
6
+ "stdout_contains": [
7
+ "doctor:"
8
+ ],
9
+ "stdout_not_contains": [
10
+ "undefined",
11
+ "Error:"
12
+ ]
13
+ },
14
+ {
15
+ "cmd": "HOME=/nonexistent node bin/cli.js doctor",
16
+ "exit_code": 1,
17
+ "stdout_contains": [
18
+ "/nonexistent"
19
+ ],
20
+ "stdout_not_contains": []
21
+ },
22
+ {
23
+ "cmd": "python3 -c \"import subprocess; r = subprocess.run(['node', 'bin/cli.js', 'doctor'], capture_output=True); n = r.stdout.count(b'\\x1b['); print(n); exit(0 if n == 0 else 1)\"",
24
+ "exit_code": 0,
25
+ "stdout_contains": [
26
+ "0"
27
+ ],
28
+ "stdout_not_contains": []
29
+ },
30
+ {
31
+ "cmd": "node bin/cli.js doctor --help",
32
+ "exit_code": 0,
33
+ "stdout_contains": [
34
+ "doctor"
35
+ ],
36
+ "stdout_not_contains": []
37
+ },
38
+ {
39
+ "cmd": "node bin/cli.js --help",
40
+ "exit_code": 0,
41
+ "stdout_contains": [
42
+ "doctor"
43
+ ],
44
+ "stdout_not_contains": []
45
+ },
46
+ {
47
+ "cmd": "node bin/cli.js doctor --verbose",
48
+ "exit_code": 0,
49
+ "stdout_contains": [
50
+ "doctor:"
51
+ ],
52
+ "stdout_not_contains": [
53
+ "Error:"
54
+ ]
55
+ }
56
+ ],
57
+ "forbidden_patterns": [
58
+ {
59
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(?:\\[\\]|null|undefined|false|''|\\{\\s*\\})",
60
+ "description": "silent catch returning a fallback value (null / undefined / [] / false / '' / empty {}) \u2014 violates no-silent-catches policy. Structured error returns like `return { level: 'fail', message }` are NOT silent (they surface a user-visible failure object) and must not match.",
61
+ "files": [
62
+ "bin/cli.js"
63
+ ],
64
+ "severity": "disqualifier"
65
+ },
66
+ {
67
+ "pattern": "@ts-ignore",
68
+ "description": "@ts-ignore escape hatch",
69
+ "files": [
70
+ "bin/cli.js"
71
+ ],
72
+ "severity": "disqualifier"
73
+ }
74
+ ],
75
+ "required_files": [
76
+ "bin/cli.js"
77
+ ],
78
+ "forbidden_files": [],
79
+ "max_deps_added": 0,
80
+ "spec_output_files": [
81
+ "bin/cli.js",
82
+ "tests/cli.test.js"
83
+ ]
84
+ }
@@ -0,0 +1,21 @@
1
+ {
2
+ "id": "F2-cli-medium-subcommand",
3
+ "category": "medium",
4
+ "difficulty": "medium",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": [
7
+ "node"
8
+ ],
9
+ "browser": false,
10
+ "deps_change_expected": false,
11
+ "intent": "Add a `doctor` subcommand to bin/cli.js that diagnoses the local environment: node version check, $HOME/.claude directory check, installed plugins count, installed skills count, TTY-gated ANSI color, summary line, exit code, --verbose flag, help integration. Zero new npm dependencies. No silent error catches.",
12
+ "pair_plan_oracle_categories": [
13
+ "scope-tier-a:lockfile-deletion",
14
+ "scope-tier-a:tier-a-violation",
15
+ "scope-tier-b:scope-unmatched",
16
+ "test-fidelity:assertion-regression",
17
+ "test-fidelity:mock-swap",
18
+ "test-fidelity:test-file-deleted",
19
+ "test-fidelity:test-file-renamed"
20
+ ]
21
+ }
@@ -0,0 +1,214 @@
1
+ {
2
+ "accepted_invariants": [
3
+ {
4
+ "authority": "expected.json/forbidden_patterns",
5
+ "id": "forbidden_pattern__ts_ignore_escape_hatch__bin_cli_js",
6
+ "operational_check": "variant arm output MUST NOT contain regex pattern '@ts-ignore' in files ['bin/cli.js']; rationale: @ts-ignore escape hatch",
7
+ "paraphrase": "variant arm output MUST NOT contain regex pattern '@ts-ignore' in files ['bin/cli.js']; rationale: @ts-ignore escape hat",
8
+ "source_refs": [
9
+ "expected.json:forbidden_patterns[1]"
10
+ ]
11
+ },
12
+ {
13
+ "authority": "expected.json/max_deps_added",
14
+ "id": "max_deps_added__0",
15
+ "operational_check": "variant arm MUST NOT add more than 0 new npm dependencies (count delta of package.json:dependencies + devDependencies)",
16
+ "paraphrase": "variant arm MUST NOT add more than 0 new npm dependencies (count delta of package.json:dependencies + devDependencies)",
17
+ "source_refs": [
18
+ "expected.json:max_deps_added"
19
+ ]
20
+ },
21
+ {
22
+ "authority": "expected.json/required_files",
23
+ "id": "required_file__bin_cli_js",
24
+ "operational_check": "variant arm output MUST contain file 'bin/cli.js' (created or preserved)",
25
+ "paraphrase": "variant arm output MUST contain file 'bin/cli.js' (created or preserved)",
26
+ "source_refs": [
27
+ "expected.json:required_files[bin/cli.js]"
28
+ ]
29
+ },
30
+ {
31
+ "authority": "metadata/oracle-allowlist",
32
+ "id": "scope-tier-a:lockfile-deletion",
33
+ "operational_check": "variant arm MUST NOT delete a scaffold-present lockfile",
34
+ "paraphrase": "variant arm MUST NOT delete a scaffold-present lockfile",
35
+ "source_refs": [
36
+ "oracle-scope-tier-a.py"
37
+ ]
38
+ },
39
+ {
40
+ "authority": "metadata/oracle-allowlist",
41
+ "id": "scope-tier-a:tier-a-violation",
42
+ "operational_check": "variant arm MUST NOT add or modify paths matching: docs/roadmap/** | docs/VISION.md | docs/ROADMAP.md | .github/** | node_modules/** | **/node_modules/** | test-results/** | coverage/** | .nyc_output/** | basename suffix .log | basename prefix .env or secrets.",
43
+ "paraphrase": "variant arm MUST NOT add or modify paths matching: docs/roadmap/** | docs/VISION.md | docs/ROADMAP.md | .github/** | nod",
44
+ "source_refs": [
45
+ "oracle-scope-tier-a.py"
46
+ ]
47
+ },
48
+ {
49
+ "authority": "metadata/oracle-allowlist",
50
+ "id": "scope-tier-b:scope-unmatched",
51
+ "operational_check": "every variant-touched file MUST be either inside spec_output_files (Tier C) OR reachable from a Tier C seed via static JS/TS imports OR matched by expected.json:tier_a_waivers",
52
+ "paraphrase": "every variant-touched file MUST be either inside spec_output_files (Tier C) OR reachable from a Tier C seed via static J",
53
+ "source_refs": [
54
+ "oracle-scope-tier-b.py"
55
+ ]
56
+ },
57
+ {
58
+ "authority": "expected.json/spec_output_files",
59
+ "id": "spec_output_file__bin_cli_js",
60
+ "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'bin/cli.js' is one Tier C seed",
61
+ "paraphrase": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'bin/cli.js' is o",
62
+ "source_refs": [
63
+ "expected.json:spec_output_files[bin/cli.js]"
64
+ ]
65
+ },
66
+ {
67
+ "authority": "expected.json/spec_output_files",
68
+ "id": "spec_output_file__tests_cli_test_js",
69
+ "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'tests/cli.test.js' is one Tier C seed",
70
+ "paraphrase": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'tests/cli.test.j",
71
+ "source_refs": [
72
+ "expected.json:spec_output_files[tests/cli.test.js]"
73
+ ]
74
+ },
75
+ {
76
+ "authority": "metadata/oracle-allowlist",
77
+ "id": "test-fidelity:assertion-regression",
78
+ "operational_check": "effective assertion count MUST NOT drop and skipped-test count MUST NOT rise; vacuous expect.assertions(0) is treated as a real regression",
79
+ "paraphrase": "effective assertion count MUST NOT drop and skipped-test count MUST NOT rise; vacuous expect.assertions(0) is treated as",
80
+ "source_refs": [
81
+ "oracle-test-fidelity.py"
82
+ ]
83
+ },
84
+ {
85
+ "authority": "metadata/oracle-allowlist",
86
+ "id": "test-fidelity:mock-swap",
87
+ "operational_check": "post-arm test file MUST NOT swap REAL_PATTERNS hits for MOCK_PATTERNS hits (jest/vi/sinon, nock/msw, app.handle/inject/callback, hand-rolled IncomingMessage/ServerResponse, etc.); a drop in real_calls combined with a rise in mock_calls is a mock-swap flag",
88
+ "paraphrase": "post-arm test file MUST NOT swap REAL_PATTERNS hits for MOCK_PATTERNS hits (jest/vi/sinon, nock/msw, app.handle/inject/c",
89
+ "source_refs": [
90
+ "oracle-test-fidelity.py"
91
+ ]
92
+ },
93
+ {
94
+ "authority": "metadata/oracle-allowlist",
95
+ "id": "test-fidelity:test-file-deleted",
96
+ "operational_check": "no scaffold-present test file may be deleted by the variant arm; deletion of an existing tests/*.test.* / *.spec.* / *.e2e.* file is a flag-severity finding",
97
+ "paraphrase": "no scaffold-present test file may be deleted by the variant arm; deletion of an existing tests/*.test.* / *.spec.* / *.e",
98
+ "source_refs": [
99
+ "oracle-test-fidelity.py"
100
+ ]
101
+ },
102
+ {
103
+ "authority": "metadata/oracle-allowlist",
104
+ "id": "test-fidelity:test-file-renamed",
105
+ "operational_check": "rename of a scaffold-present test file is warn-severity (content fidelity not verified across renames in step 1)",
106
+ "paraphrase": "rename of a scaffold-present test file is warn-severity (content fidelity not verified across renames in step 1)",
107
+ "source_refs": [
108
+ "oracle-test-fidelity.py"
109
+ ]
110
+ },
111
+ {
112
+ "authority": "expected.json/verification_commands",
113
+ "id": "verification__3f35982a",
114
+ "operational_check": "running `node bin/cli.js doctor` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor:']; stdout MUST NOT contain any of ['undefined', 'Error:']",
115
+ "paraphrase": "running `node bin/cli.js doctor` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor:'];",
116
+ "source_refs": [
117
+ "expected.json:verification_commands[0]"
118
+ ]
119
+ },
120
+ {
121
+ "authority": "expected.json/verification_commands",
122
+ "id": "verification__460fce04",
123
+ "operational_check": "running `HOME=/nonexistent node bin/cli.js doctor` in the post-arm work dir MUST exit with code 1; stdout MUST contain all of ['/nonexistent']; stdout MUST NOT contain any of []",
124
+ "paraphrase": "running `HOME=/nonexistent node bin/cli.js doctor` in the post-arm work dir MUST exit with code 1; stdout MUST contain a",
125
+ "source_refs": [
126
+ "expected.json:verification_commands[1]"
127
+ ]
128
+ },
129
+ {
130
+ "authority": "expected.json/verification_commands",
131
+ "id": "verification__973e287e",
132
+ "operational_check": "running `python3 -c \"import subprocess; r = subprocess.run(['node', 'bin/cli.js', 'doctor'], capture_output=True); n = r.stdout.count(b'\\x1b['); print(n); exit(0 if n == 0 else 1)\"` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['0']; stdout MUST NOT contain any of []",
133
+ "paraphrase": "running `python3 -c \"import subprocess; r = subprocess.run(['node', 'bin/cli.js', 'doctor'], capture_output=True); n = r",
134
+ "source_refs": [
135
+ "expected.json:verification_commands[2]"
136
+ ]
137
+ },
138
+ {
139
+ "authority": "expected.json/verification_commands",
140
+ "id": "verification__d6253a97",
141
+ "operational_check": "running `node bin/cli.js doctor --help` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor']; stdout MUST NOT contain any of []",
142
+ "paraphrase": "running `node bin/cli.js doctor --help` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doc",
143
+ "source_refs": [
144
+ "expected.json:verification_commands[3]"
145
+ ]
146
+ },
147
+ {
148
+ "authority": "expected.json/verification_commands",
149
+ "id": "verification__e0f149e4",
150
+ "operational_check": "running `node bin/cli.js --help` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor']; stdout MUST NOT contain any of []",
151
+ "paraphrase": "running `node bin/cli.js --help` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor']; ",
152
+ "source_refs": [
153
+ "expected.json:verification_commands[4]"
154
+ ]
155
+ },
156
+ {
157
+ "authority": "expected.json/verification_commands",
158
+ "id": "verification__fdbcd321",
159
+ "operational_check": "running `node bin/cli.js doctor --verbose` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['doctor:']; stdout MUST NOT contain any of ['Error:']",
160
+ "paraphrase": "running `node bin/cli.js doctor --verbose` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['",
161
+ "source_refs": [
162
+ "expected.json:verification_commands[5]"
163
+ ]
164
+ }
165
+ ],
166
+ "authority_order": [
167
+ "spec.md",
168
+ "expected.json/rubric",
169
+ "phase prompt",
170
+ "model preference"
171
+ ],
172
+ "escalated_to_user": [],
173
+ "fixture_id": "F2-cli-medium-subcommand",
174
+ "model_stamps": {
175
+ "claude": {
176
+ "blocked_ids": [],
177
+ "model": "claude-opus-4-7",
178
+ "signed_plan_sha256": "05d19dc09a1c8820f58afcd091c2cd20888f7bf1141af4ec451a69723af0588a",
179
+ "status": "sign",
180
+ "timestamp": "2026-04-29T18:30:00Z"
181
+ },
182
+ "codex": {
183
+ "blocked_ids": [],
184
+ "model": "gpt-5.5",
185
+ "signed_plan_sha256": "05d19dc09a1c8820f58afcd091c2cd20888f7bf1141af4ec451a69723af0588a",
186
+ "status": "sign",
187
+ "timestamp": "2026-04-29T18:31:00Z"
188
+ }
189
+ },
190
+ "plan_status": "final",
191
+ "planning_mode": "pair",
192
+ "rejected_alternatives": [],
193
+ "rounds": [
194
+ {
195
+ "claude_draft_sha256": "0000000000000000000000000000000000000000000000000000000000000000",
196
+ "codex_draft_sha256": "1111111111111111111111111111111111111111111111111111111111111111",
197
+ "merged_sha256": "2222222222222222222222222222222222222222222222222222222222222222",
198
+ "note": "sample-pass synthetic round (test fixture)",
199
+ "round": 1
200
+ }
201
+ ],
202
+ "schema_version": "1",
203
+ "source": {
204
+ "canonical_id_registry_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json",
205
+ "canonical_id_registry_sha256": "98ac16e4536ea3ef2e51d3c728982c014211c193a742cea74f1331e4fbba76be",
206
+ "expected_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json",
207
+ "expected_sha256": "ddef8feba49f20b6957e37840bc6a03e78e554776e380d81ad6390944c72fcab",
208
+ "rubric_path": "benchmark/auto-resolve/RUBRIC.md",
209
+ "rubric_sha256": "5b5b709a0b57f7e6f4fbc072af91e1edbc8d7910ae16b9b7be7170616aeaa9af",
210
+ "spec_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md",
211
+ "spec_sha256": "9b0949c2afd4a522de2bdbbf267d93907fd908bf0f1d0dc5e111ee30ba875bb7"
212
+ },
213
+ "unresolved": []
214
+ }