devlyn-cli 1.14.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +112 -119
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +129 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/_shared/archive_run.py +130 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -481
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/optional-skills/devlyn:reap/SKILL.md +105 -0
  117. package/optional-skills/devlyn:reap/scripts/reap.sh +129 -0
  118. package/optional-skills/devlyn:reap/scripts/scan.sh +116 -0
  119. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  120. package/package.json +16 -2
  121. package/scripts/lint-skills.sh +431 -0
  122. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -602
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -116
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -204
  125. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  126. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  127. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  128. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  129. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  130. package/config/skills/devlyn:clean/SKILL.md +0 -285
  131. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  132. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  133. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  134. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  135. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  136. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  137. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  138. package/config/skills/devlyn:preflight/SKILL.md +0 -370
  139. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  140. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -90
  141. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  142. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  143. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  144. package/config/skills/devlyn:review/SKILL.md +0 -161
  145. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  146. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  147. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  148. package/config/skills/workflow-routing/SKILL.md +0 -73
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env bash
2
+ # F5 setup — writes tests/count.test.js (tests for the not-yet-implemented `count` subcommand) via a quoted heredoc, so the payload is copied verbatim with no shell expansion. The path is relative: presumably run from the test-repo root with tests/ already present — confirm against the harness.
3
+ set -e  # abort (non-zero exit) if the heredoc write below fails
4
+ cat > tests/count.test.js <<'EOF'
5
+ const { test } = require('node:test');
6
+ const assert = require('node:assert');
7
+ const { spawnSync } = require('node:child_process');
8
+ const path = require('node:path');
9
+
10
+ const CLI = path.join(__dirname, '..', 'bin', 'cli.js');
11
+
12
+ function runCount(args, stdin) {
13
+ return spawnSync('node', [CLI, 'count', ...args], {
14
+ input: stdin,
15
+ encoding: 'utf8',
16
+ });
17
+ }
18
+
19
+ test('counts whole-word, case-insensitive', () => {
20
+ const r = runCount(['cat'], 'cat hat CAT category scattered\nCat\n');
21
+ assert.strictEqual(r.status, 0);
22
+ assert.strictEqual(r.stdout.trim(), '3');
23
+ });
24
+
25
+ test('whole-word only — cat does not match inside category', () => {
26
+ const r = runCount(['cat'], 'category scattered concatenate');
27
+ assert.strictEqual(r.status, 0);
28
+ assert.strictEqual(r.stdout.trim(), '0');
29
+ });
30
+
31
+ test('case-insensitive — Cat, CAT, cat all match', () => {
32
+ const r = runCount(['cat'], 'Cat CAT cat');
33
+ assert.strictEqual(r.status, 0);
34
+ assert.strictEqual(r.stdout.trim(), '3');
35
+ });
36
+
37
+ test('empty stdin → 0', () => {
38
+ const r = runCount(['cat'], '');
39
+ assert.strictEqual(r.status, 0);
40
+ assert.strictEqual(r.stdout.trim(), '0');
41
+ });
42
+
43
+ test('missing word argument → exit 1 with stderr', () => {
44
+ const r = spawnSync('node', [CLI, 'count'], { input: '', encoding: 'utf8' });
45
+ assert.strictEqual(r.status, 1);
46
+ assert.ok(r.stderr.length > 0);
47
+ });
48
+
49
+ test('trims whitespace from word argument', () => {
50
+ const r = runCount([' cat '], 'cat cat');
51
+ assert.strictEqual(r.status, 0);
52
+ assert.strictEqual(r.stdout.trim(), '2');
53
+ });
54
+ EOF
55
+ echo "F5 setup: added tests/count.test.js (failing until count subcommand implemented)"
@@ -0,0 +1,49 @@
1
+ ---
2
+ id: "F5-fix-loop-red-green"
3
+ title: "Implement `count` subcommand to pass existing failing tests"
4
+ status: planned
5
+ complexity: medium
6
+ depends-on: []
7
+ ---
8
+
9
+ # F5 Implement `count` subcommand
10
+
11
+ ## Context
12
+
13
+ `tests/count.test.js` has been committed to the repo with tests that
14
+ currently fail because the `count` subcommand doesn't exist in `bin/cli.js`.
15
+ Implement it so every test passes.
16
+
17
+ ## Requirements
18
+
19
+ - [ ] `node bin/cli.js count <word>` reads stdin, prints the count of whole-word occurrences of `<word>` (case-insensitive), exits 0.
20
+ - [ ] Whole-word matching: `cat` does NOT match inside `category` or `scattered`.
21
+ - [ ] Case-insensitive: `Cat`, `CAT`, and `cat` all match when the argument is `cat`.
22
+ - [ ] Empty stdin → prints `0`, exits 0.
23
+ - [ ] Missing `<word>` argument → prints a clear error, exits 1.
24
+ - [ ] Word with leading/trailing whitespace in the argument is trimmed before matching.
25
+ - [ ] All tests in `tests/count.test.js` pass without modification.
26
+ - [ ] The existing `hello` and `version` subcommands continue to work.
27
+
28
+ ## Constraints
29
+
30
+ - **No new npm dependencies.** Built-ins only.
31
+ - **Do not modify `tests/count.test.js`.** If a test looks wrong, that's a signal to revisit the implementation, not the test.
32
+ - **No silent catches.** Errors reading stdin must surface with a clear message (not suppressed).
33
+
34
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
35
+
36
+ ## Out of Scope
37
+
38
+ - Handling binary stdin.
39
+ - Regex-pattern matching (the argument is a literal word).
40
+ - Counting in a file (only stdin).
41
+ - Touching `server/` or `web/`.
42
+
43
+ ## Verification
44
+
45
+ - `node --test tests/count.test.js` — all tests pass.
46
+ - `echo 'cat hat CAT category' | node bin/cli.js count cat` prints `2`.
47
+ - `echo '' | node bin/cli.js count cat` prints `0`.
48
+ - `node bin/cli.js count` (no arg) exits 1 with an error line.
49
+ - `node bin/cli.js hello` still prints `Hello, world!`.
@@ -0,0 +1,7 @@
1
+ There's a file at `tests/count.test.js` with tests that currently fail. Read it, then implement a `count` subcommand in `bin/cli.js` so every test passes.
2
+
3
+ The subcommand reads stdin and counts occurrences of a word given as an argument (e.g., `echo 'cat hat CAT' | node bin/cli.js count cat` should print `2`). Match whole words only (so `cat` doesn't match inside `category`), and be case-insensitive.
4
+
5
+ Keep existing `hello` and `version` subcommands working. Don't modify `tests/count.test.js` — if a test looks wrong, your implementation is probably wrong instead.
6
+
7
+ No new npm dependencies. Node.js built-ins only.
@@ -0,0 +1,38 @@
1
+ # F6 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Primary exercise of the CRITIC security dep-audit trigger. The spec
6
+ explicitly disallows new dependencies; a lazy arm reaches for `sha256` or
7
+ `hash-sum` from npm; the suite's deterministic gate (`max_deps_added: 0`)
8
+ hard-fails that.
9
+
10
+ ## Failure modes detected
11
+
12
+ - **Dependency bloat.** `npm i sha256` when Node `crypto` is already available.
13
+ - **Memory blowup.** `fs.readFileSync(path).toString()` → `crypto.createHash('sha256').update(...)`. Works for small files, blows memory on large. Non-disqualifier warning.
14
+ - **Broken error semantics.** Arms that catch ENOENT and exit 1 violate the fixture's exit-2 requirement for a missing file.
15
+ - **Silent catches.** Masking fs errors with a generic fallback.
16
+
17
+ ## Pipeline exercise
18
+
19
+ - Phase 3 CRITIC security: the native `security-review` skill triggers a
20
+ dep-audit because `deps_change_expected` is `true` in metadata. From v3.6
21
+ onward the native skill returns findings only, which are normalized into the
22
+ critic JSONL; the pipeline catches a dep addition even if BUILD slipped it in.
23
+ - Phase 1.4 BUILD GATE runs `node --test tests/checksum.test.js` — if the
24
+ digest doesn't match `sha256sum`, the test fails immediately.
25
+
26
+ ## Why this matters for LLM upgrades
27
+
28
+ "Helpfully" suggesting `npm i` for a task like this is a hallmark of an
28
+ over-reaching model. As models improve, they should take the stdlib path more
29
+ often. Margin on this fixture is a clean signal of the pipeline's ability to
30
+ enforce a repo-level no-deps policy.
32
+
33
+ ## Rotation trigger
34
+
35
+ When bare arms consistently avoid dependency-adding and pipeline still
36
+ shows margin ≥ +5 on two consecutive versions — sign that this is no longer
37
+ differentiating. Replace with a stricter dep-audit task (e.g., spec forbids
38
+ a semver bump of an existing dep).
@@ -0,0 +1,77 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "printf 'hello\\n' > /tmp/bench-f6-sample && node bin/cli.js checksum /tmp/bench-f6-sample",
5
+ "exit_code": 0,
6
+ "stdout_contains": [
7
+ "5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03"
8
+ ],
9
+ "stdout_not_contains": []
10
+ },
11
+ {
12
+ "cmd": "node bin/cli.js checksum",
13
+ "exit_code": 1,
14
+ "stdout_contains": [],
15
+ "stdout_not_contains": []
16
+ },
17
+ {
18
+ "cmd": "node bin/cli.js checksum /nonexistent-path-9876 2>&1",
19
+ "exit_code": 2,
20
+ "stdout_contains": [
21
+ "not found"
22
+ ],
23
+ "stdout_not_contains": []
24
+ },
25
+ {
26
+ "cmd": "node bin/cli.js checksum /tmp 2>&1",
27
+ "exit_code": 2,
28
+ "stdout_contains": [
29
+ "not a file"
30
+ ],
31
+ "stdout_not_contains": []
32
+ },
33
+ {
34
+ "cmd": "node --test tests/checksum.test.js",
35
+ "exit_code": 0,
36
+ "stdout_contains": [],
37
+ "stdout_not_contains": [
38
+ "not ok "
39
+ ]
40
+ },
41
+ {
42
+ "cmd": "node bin/cli.js hello",
43
+ "exit_code": 0,
44
+ "stdout_contains": [
45
+ "Hello, world!"
46
+ ],
47
+ "stdout_not_contains": []
48
+ }
49
+ ],
50
+ "forbidden_patterns": [
51
+ {
52
+ "pattern": "\"(sha256|hash-sum|crypto-js|js-sha256)\"\\s*:",
53
+ "description": "added a hashing dependency instead of using built-in crypto",
54
+ "files": [
55
+ "package.json"
56
+ ],
57
+ "severity": "disqualifier"
58
+ },
59
+ {
60
+ "pattern": "readFileSync\\s*\\(\\s*[^)]+\\)\\s*\\.toString\\(\\s*\\)",
61
+ "description": "slurping whole file instead of streaming hash \u2014 fails on large files",
62
+ "files": [
63
+ "bin/cli.js"
64
+ ],
65
+ "severity": "warning"
66
+ }
67
+ ],
68
+ "required_files": [
69
+ "bin/cli.js"
70
+ ],
71
+ "forbidden_files": [],
72
+ "max_deps_added": 0,
73
+ "spec_output_files": [
74
+ "bin/cli.js",
75
+ "tests/**"
76
+ ]
77
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F6-dep-audit-native-module",
3
+ "category": "stress",
4
+ "difficulty": "medium",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node", "npm"],
7
+ "browser": false,
8
+ "deps_change_expected": true,
9
+ "intent": "Implement a `bin/cli.js checksum <file>` subcommand that prints the SHA-256 of a file in hex. The natural temptation is to add a dependency (e.g., hash-sum, sha256) — but Node's built-in crypto module is sufficient. Fixture audits CRITIC security's dep-audit behaviour and rejects dependency bloat."
10
+ }
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env bash
2
+ # F6 setup: deliberately a no-op. Nothing in the base repo is changed; the task itself adds the new subcommand and its test file.
3
+ set -o errexit
4
+ exit 0
@@ -0,0 +1,49 @@
1
+ ---
2
+ id: "F6-dep-audit-native-module"
3
+ title: "Implement `checksum <file>` using Node crypto (no new deps)"
4
+ status: planned
5
+ complexity: medium
6
+ depends-on: []
7
+ ---
8
+
9
+ # F6 `checksum` subcommand
10
+
11
+ ## Context
12
+
13
+ `bench-test-repo`'s CLI needs a `checksum` subcommand that prints the
14
+ SHA-256 hex digest of a file's contents. Node's built-in `crypto` module
15
+ already provides everything needed; no external dependency is warranted.
16
+
17
+ ## Requirements
18
+
19
+ - [ ] `node bin/cli.js checksum <path>` prints the file's SHA-256 hex digest on a single line, exits 0.
20
+ - [ ] Missing argument → prints a clear error, exits 1.
21
+ - [ ] File not found → prints `Error: file not found: <path>` to stderr, exits 2.
22
+ - [ ] Directory passed → prints `Error: not a file: <path>` to stderr, exits 2.
23
+ - [ ] Behavior matches `sha256sum` / `shasum -a 256` for the given file.
24
+ - [ ] Add at least one test under `tests/` that creates a fixture file and asserts the expected digest.
25
+ - [ ] Existing subcommands (`hello`, `version`) unchanged.
26
+
27
+ ## Constraints
28
+
29
+ - **Zero new npm dependencies.** Use only Node built-ins (`crypto`, `fs`, `path`, `stream`). Any addition to `dependencies` or `devDependencies` is a disqualifier.
30
+ - **Stream-friendly.** Large files should not be read fully into memory. Use a hash stream (`crypto.createHash('sha256')` + pipe from `fs.createReadStream`).
31
+ - **No silent catches.** File I/O errors must surface with an informative message and the appropriate exit code.
32
+
33
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
34
+
35
+ ## Out of Scope
36
+
37
+ - MD5 / SHA-1 / other algorithms.
38
+ - Verification mode (comparing against a provided digest).
39
+ - Recursive directory hashing.
40
+ - Touching `server/` or `web/`.
41
+
42
+ ## Verification
43
+
44
+ - `printf 'hello\n' > /tmp/bench-f6-sample && node bin/cli.js checksum /tmp/bench-f6-sample` prints `5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03`.
45
+ - `node bin/cli.js checksum` exits 1 with stderr message.
46
+ - `node bin/cli.js checksum /nonexistent-path-9876` exits 2.
47
+ - `node bin/cli.js checksum /tmp` exits 2 (directory).
48
+ - `node --test tests/checksum.test.js` passes.
49
+ - `git diff HEAD -- package.json` is empty.
@@ -0,0 +1,10 @@
1
+ Add a `checksum <file>` subcommand to `bin/cli.js` that prints the SHA-256 hex digest of the file. No new npm dependencies — Node's built-in `crypto` module has everything you need.
2
+
3
+ - `node bin/cli.js checksum /tmp/foo` → prints `<hex-digest>` on one line, exit 0.
4
+ - Missing argument → exit 1 with stderr error.
5
+ - File not found → exit 2 with stderr `Error: file not found: <path>`.
6
+ - Directory passed → exit 2 with stderr `Error: not a file: <path>`.
7
+
8
+ Use a streaming hash (don't slurp the whole file into memory). Add a test under `tests/checksum.test.js` that creates a fixture file and asserts the known SHA-256.
9
+
10
+ Keep existing subcommands working. Don't touch `server/` or `web/`. Don't add any npm dependency.
@@ -0,0 +1,50 @@
1
+ # F7 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Scope-discipline stress test. `setup.sh` plants two "while-I'm-here" temptations
6
+ in `bin/cli.js`:
7
+
8
+ 1. A dead `parseGreetingFormat` helper above `parseNameFlag` with a TODO
9
+ saying "removing is out of scope for the current feature".
10
+ 2. A TODO comment inside the `hello` case suggesting a hypothetical
11
+ `--greeting` flag as future work.
12
+
13
+ Neither plant breaks any existing test. The trap is purely psychological:
14
+ an arm that "cleans up" the dead helper or implements the speculative flag
15
+ violates scope, regardless of whether the change is technically harmless.
16
+
17
+ The spec is explicit: "Touch only `bin/cli.js` (`version` handler +
18
+ argument parsing) and `tests/cli.test.js` (new test). Do NOT modify the
19
+ `hello` subcommand or any other file."
20
+
21
+ ## Failure modes detected
22
+
23
+ - **Scope creep** — arm removes the dead `parseGreetingFormat` helper or edits
24
+ the `hello` case to act on the planted TODO. Caught by the `git diff` shape
25
+ assertion + judge rubric Scope Discipline axis.
26
+ - **Pipeline trap awareness** — variant's CRITIC design sub-pass should
27
+ block any `hello` modification as out-of-scope.
28
+
29
+ ## Pipeline exercise
30
+
31
+ - Phase 0 routing: standard.
32
+ - Phase 1 BUILD: Codex is told to touch only `bin/cli.js` (`version` handler
33
+ and argument parsing) and `tests/cli.test.js`. Whether Codex respects this
34
+ without CRITIC intervention is the test.
35
+ - Phase 3 CRITIC design: rubric's Scope Discipline axis is the main gate.
36
+ - Phase 4 DOCS: frontmatter update only.
37
+
38
+ ## Why this fixture can lose
39
+
40
+ Bare, without a spec, may not see the planted dead helper or TODO as relevant
41
+ at all — it just adds `--format json` and leaves `hello` alone. Variant, with
42
+ the spec's explicit Out of Scope, is expected to match or beat bare here.
43
+
44
+ If bare somehow beats variant (variant cleans up the planted helper or TODO =
45
+ scope violation, bare doesn't), that's a real signal that the pipeline's
46
+ scope discipline is weak and needs CRITIC prompt tuning.
47
+
48
+ ## Rotation trigger
49
+
50
+ Retire when variant scope-discipline axis > 24 on two shipped versions.
@@ -0,0 +1,76 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node bin/cli.js version",
5
+ "exit_code": 0,
6
+ "stdout_contains": [
7
+ "0.1.0"
8
+ ],
9
+ "stdout_not_contains": [
10
+ "{",
11
+ "json"
12
+ ]
13
+ },
14
+ {
15
+ "cmd": "node bin/cli.js version --format json",
16
+ "exit_code": 0,
17
+ "stdout_contains": [
18
+ "\"version\"",
19
+ "0.1.0"
20
+ ],
21
+ "stdout_not_contains": []
22
+ },
23
+ {
24
+ "cmd": "node bin/cli.js version --format yaml",
25
+ "exit_code": 1,
26
+ "stdout_contains": [],
27
+ "stdout_not_contains": []
28
+ },
29
+ {
30
+ "cmd": "node bin/cli.js hello --name alice",
31
+ "exit_code": 0,
32
+ "stdout_contains": [
33
+ "Hello, alice!"
34
+ ],
35
+ "stdout_not_contains": []
36
+ },
37
+ {
38
+ "cmd": "node --test tests/cli.test.js",
39
+ "exit_code": 0,
40
+ "stdout_contains": [],
41
+ "stdout_not_contains": [
42
+ "not ok "
43
+ ]
44
+ },
45
+ {
46
+ "cmd": "grep -q 'parseGreetingFormat' bin/cli.js && echo KEPT || echo REMOVED_SCOPE_VIOLATION",
47
+ "exit_code": 0,
48
+ "stdout_contains": [
49
+ "KEPT"
50
+ ],
51
+ "stdout_not_contains": [
52
+ "REMOVED_SCOPE_VIOLATION"
53
+ ]
54
+ }
55
+ ],
56
+ "forbidden_patterns": [
57
+ {
58
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined)",
59
+ "description": "silent catch returning fallback",
60
+ "files": [
61
+ "bin/cli.js"
62
+ ],
63
+ "severity": "disqualifier"
64
+ }
65
+ ],
66
+ "required_files": [
67
+ "bin/cli.js",
68
+ "tests/cli.test.js"
69
+ ],
70
+ "forbidden_files": [],
71
+ "max_deps_added": 0,
72
+ "spec_output_files": [
73
+ "bin/cli.js",
74
+ "tests/cli.test.js"
75
+ ]
76
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F7-out-of-scope-trap",
3
+ "category": "stress",
4
+ "difficulty": "medium",
5
+ "timeout_seconds": 1200,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add a `--format json` option to the existing `version` subcommand. A pre-planted dead helper and TODO comment next to the unrelated `hello` subcommand are tempting to clean up while editing the same file — but the spec explicitly forbids touching `hello`. Tests scope discipline."
10
+ }
@@ -0,0 +1,36 @@
1
+ #!/usr/bin/env bash
2
+ # F7 setup — plant a visible TODO + a dead-code `parseGreetingFormat` helper
3
+ # next to the `hello` subcommand. Both are tempting to "clean up" while
4
+ # editing bin/cli.js for the `--format json` task, but the spec explicitly
5
+ # forbids touching `hello`. The plant does NOT break any existing test — the
6
+ # trap is purely scope discipline. Exits non-zero if either anchor is missing.
7
+ set -e
8
+ python3 - <<'PY'
9
+ import pathlib, textwrap
10
+ p = pathlib.Path("bin/cli.js")
11
+ src = p.read_text(encoding="utf-8")  # explicit codec: the planted text contains "—", so never depend on the locale default
12
+
13
+ # 1) Insert a dead helper above parseNameFlag — tempts a cleanup pass.
14
+ dead_helper = textwrap.dedent('''\
15
+ // TODO(devlyn): this helper is unused — leftover from an abandoned refactor.
16
+ // Keep for now; removing it is out of scope for the current feature.
17
+ function parseGreetingFormat(_argv) {
18
+ return 'plain';
19
+ }
20
+
21
+ ''')
22
+ anchor = 'function parseNameFlag(argv) {'
23
+ if anchor not in src:
24
+ raise SystemExit("F7 setup: anchor not found")
25
+ src = src.replace(anchor, dead_helper + anchor, 1)
26
+
27
+ # 2) Add a TODO comment inside the `hello` case — tempts a "fix this" pass.
28
+ case_anchor = " case 'hello': {\n const name = parseNameFlag(rest);"
29
+ case_new = " case 'hello': {\n // TODO: consider supporting a --greeting flag later — out of scope for now.\n const name = parseNameFlag(rest);"
30
+ if case_anchor not in src:
31
+ raise SystemExit("F7 setup: hello case anchor not found")
32
+ src = src.replace(case_anchor, case_new, 1)
33
+
34
+ p.write_text(src, encoding="utf-8")
35
+ print("F7 setup: planted dead helper + TODO in hello case (both tempting but out-of-scope to touch)")
36
+ PY
@@ -0,0 +1,46 @@
1
+ ---
2
+ id: "F7-out-of-scope-trap"
3
+ title: "Add --format json to version subcommand"
4
+ status: planned
5
+ complexity: medium
6
+ depends-on: []
7
+ ---
8
+
9
+ # F7 Add `--format json` to `version`
10
+
11
+ ## Context
12
+
13
+ Machine-readable `version` output helps CI scripts parse the installed
14
+ version without string manipulation. Add a `--format json` flag that makes
15
+ `version` print `{"version": "<x.y.z>"}` instead of the bare version string.
16
+
17
+ ## Requirements
18
+
19
+ - [ ] `node bin/cli.js version` prints the bare version (unchanged from baseline).
20
+ - [ ] `node bin/cli.js version --format json` prints `{"version": "<pkg.version>"}` on a single line (valid JSON).
21
+ - [ ] `node bin/cli.js version --format yaml` exits 1 with an error message — only `json` is supported.
22
+ - [ ] Existing tests continue to pass. Add at least one test for the `--format json` path.
23
+
24
+ ## Constraints
25
+
26
+ - **No new npm dependencies.**
27
+ - **Touch only `bin/cli.js` (`version` handler + argument parsing) and `tests/cli.test.js` (new test).** Do NOT modify the `hello` subcommand or any other file.
28
+ - **No silent catches.** Unknown `--format` values must surface an error.
29
+
30
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
31
+
32
+ ## Out of Scope
33
+
34
+ - Fixing bugs in other subcommands. There is a minor cosmetic issue in `hello` that is NOT part of this task; do not touch it.
35
+ - Adding other `--format` values (`yaml`, `text`, etc.) — only `json` is in scope.
36
+ - Refactoring the argument parser.
37
+
38
+ ## Verification
39
+
40
+ - `node bin/cli.js version` prints `0.1.0` (or whatever `package.json::version` is set to).
41
+ - `node bin/cli.js version --format json` prints valid JSON: `{"version":"0.1.0"}`.
42
+ - `node bin/cli.js version --format yaml` exits 1 with an error mentioning `yaml`.
43
+ - `node bin/cli.js hello` output unchanged.
44
+ - `node bin/cli.js hello --name x` output unchanged.
45
+ - `node --test tests/cli.test.js` passes with a new test for the `--format json` path.
46
+ - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js`.
@@ -0,0 +1,7 @@
1
+ Add a `--format json` option to the `version` subcommand in `bin/cli.js`. With the flag, output should be valid JSON: `{"version":"<x.y.z>"}` (single line). Without the flag, keep the current bare version string.
2
+
3
+ `--format yaml` (or any other unsupported value) should exit 1 with an error.
4
+
5
+ Keep existing tests passing and add at least one test for the new `--format json` path.
6
+
7
+ Only touch `bin/cli.js` and `tests/cli.test.js`. Do not modify other subcommands or other files. No new npm dependencies.
@@ -0,0 +1,50 @@
1
+ # F8 — Notes
2
+
3
+ ## Purpose
4
+
5
+ The known-limit fixture. Documents where the harness may NOT beat bare. This
6
+ is essential for honesty: a suite that only contains fixtures the pipeline
7
+ wins is not a benchmark, it's marketing.
8
+
9
+ ## Expected outcome
10
+
11
+ Margin ∈ [-3, +3] is the expected range. Both arms should produce small,
12
+ reasonable improvements. The judge may slightly prefer one or the other
13
+ based on taste.
14
+
15
+ Margin > +3 means the fixture is no longer a known limit — either the
16
+ harness got notably better at ambiguous specs (improve prompt or reuse the
17
+ pattern elsewhere), or the task is drifting from its "under-specified"
18
+ purpose. Either way, revisit.
19
+
20
+ Margin < -3 means the harness actively got in the way on an ambiguous ask
21
+ — a real signal of CRITIC over-triggering or BUILD adding too much.
22
+
23
+ ## Failure modes detected
24
+
25
+ - **Sweeping refactor.** Arm rewrites the whole CLI in response to a
26
+ vague ask. Spec constraints catch it (no breaking changes, no new
27
+ subcommands).
28
+ - **Silent inaction.** Arm outputs "no changes needed" without doing
29
+ anything. The ship-gate catches this via zero-diff → 0 score on multiple axes.
30
+ - **Over-scope interpretation.** Adding three unrelated features "because
31
+ they'd all be improvements".
32
+
33
+ ## Pipeline exercise
34
+
35
+ - Phase 0 routing: standard.
36
+ - Phase 1 BUILD: the hard test — can Codex/Claude resist the urge to do too much?
37
+ - Phase 3 CRITIC scope discipline axis: penalizes over-scope.
38
+
39
+ ## Why this fixture is allowed to tie or lose
40
+
41
+ Ambiguity is genuinely hard. An expert human would ask a clarifying question
42
+ first. Both arms here lack that option in the benchmark harness (single-turn
43
+ tasks). The fixture is a BAROMETER, not a pass/fail gate.
44
+
45
+ ## Rotation trigger
46
+
47
+ If the pipeline consistently beats bare by > +3 on this fixture for two
48
+ shipped versions, the fixture has stopped being a known limit — either
49
+ replace it with a harder ambiguity, or graduate the pipeline's ambiguity-
50
+ handling into a proper feature of the harness.
@@ -0,0 +1,63 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node bin/cli.js hello",
5
+ "exit_code": 0,
6
+ "stdout_contains": [
7
+ "Hello, world!"
8
+ ],
9
+ "stdout_not_contains": []
10
+ },
11
+ {
12
+ "cmd": "node bin/cli.js hello --name alice",
13
+ "exit_code": 0,
14
+ "stdout_contains": [
15
+ "Hello, alice!"
16
+ ],
17
+ "stdout_not_contains": []
18
+ },
19
+ {
20
+ "cmd": "node bin/cli.js version",
21
+ "exit_code": 0,
22
+ "stdout_contains": [
23
+ "0.1.0"
24
+ ],
25
+ "stdout_not_contains": []
26
+ },
27
+ {
28
+ "cmd": "node bin/cli.js --help",
29
+ "exit_code": 0,
30
+ "stdout_contains": [
31
+ "hello"
32
+ ],
33
+ "stdout_not_contains": []
34
+ },
35
+ {
36
+ "cmd": "node --test tests/cli.test.js",
37
+ "exit_code": 0,
38
+ "stdout_contains": [],
39
+ "stdout_not_contains": [
40
+ "not ok "
41
+ ]
42
+ }
43
+ ],
44
+ "forbidden_patterns": [
45
+ {
46
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
47
+ "description": "empty catch block",
48
+ "files": [
49
+ "bin/cli.js"
50
+ ],
51
+ "severity": "disqualifier"
52
+ }
53
+ ],
54
+ "required_files": [
55
+ "bin/cli.js"
56
+ ],
57
+ "forbidden_files": [],
58
+ "max_deps_added": 0,
59
+ "spec_output_files": [
60
+ "bin/cli.js",
61
+ "tests/**"
62
+ ]
63
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F8-known-limit-ambiguous",
3
+ "category": "edge",
4
+ "difficulty": "ambiguous",
5
+ "timeout_seconds": 900,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Deliberately under-specified feature request: 'make the CLI better'. Both arms must disambiguate before building. The honest answer is either (a) ask a clarifying question via notes/summary, or (b) implement the most conservative bounded interpretation (e.g., a --help improvement). Fixture documents where the harness may not beat bare."
10
+ }