devlyn-cli 1.14.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +112 -119
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +129 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/_shared/archive_run.py +130 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -481
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/optional-skills/devlyn:reap/SKILL.md +105 -0
  117. package/optional-skills/devlyn:reap/scripts/reap.sh +129 -0
  118. package/optional-skills/devlyn:reap/scripts/scan.sh +116 -0
  119. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  120. package/package.json +16 -2
  121. package/scripts/lint-skills.sh +431 -0
  122. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -602
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -116
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -204
  125. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  126. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  127. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  128. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  129. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  130. package/config/skills/devlyn:clean/SKILL.md +0 -285
  131. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  132. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  133. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  134. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  135. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  136. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  137. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  138. package/config/skills/devlyn:preflight/SKILL.md +0 -370
  139. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  140. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -90
  141. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  142. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  143. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  144. package/config/skills/devlyn:review/SKILL.md +0 -161
  145. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  146. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  147. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  148. package/config/skills/workflow-routing/SKILL.md +0 -73
@@ -0,0 +1,162 @@
1
+ {
2
+ "fixture_id": "F3-backend-contract-risk",
3
+ "generated_at": "2026-04-29T09:57:53Z",
4
+ "generated_from": {
5
+ "expected_path": "benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json",
6
+ "expected_sha256": "c0925ee948179fbc1c76836d98fba0c14c7eba56f462f2922903951923cb22e6",
7
+ "metadata_path": "benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json",
8
+ "metadata_sha256": "c54530db26dbb04ce50b698fed2608206eae6f9a5dc2f666f127695e15d3fa30",
9
+ "oracle_script_shas": {
10
+ "scope-tier-a": "baaf21ed4a67f35d2a8af825e72869ef9737b5dfe08d65dd1a11c26fafe297ae",
11
+ "scope-tier-b": "9349d00a5c7456a4df9142923334e7004407d53f2443f2e210945bb771971e25",
12
+ "test-fidelity": "401184da51ae500cecfc75a6c5819b0d28acb63a397f788fb628c2913562f903"
13
+ }
14
+ },
15
+ "required_invariants": [
16
+ {
17
+ "authority": "expected.json/forbidden_patterns",
18
+ "id": "forbidden_pattern__eslint_disable_without_explicit_per_issue_justification__server_index_js",
19
+ "operational_check": "variant arm output MUST NOT contain regex pattern '/\\\\*\\\\s*eslint-disable' in files ['server/index.js']; rationale: eslint-disable without explicit per-issue justification",
20
+ "severity": "disqualifier",
21
+ "source_field": "expected.json/forbidden_patterns/1",
22
+ "source_ref": "expected.json:forbidden_patterns[1]"
23
+ },
24
+ {
25
+ "authority": "expected.json/forbidden_patterns",
26
+ "id": "forbidden_pattern__silent_catch_returning_fallback__server_index_js",
27
+ "operational_check": "variant arm output MUST NOT contain regex pattern \"catch\\\\s*\\\\([^)]*\\\\)\\\\s*\\\\{[^}]*return\\\\s+(null|undefined|'')\" in files ['server/index.js']; rationale: silent catch returning fallback",
28
+ "severity": "disqualifier",
29
+ "source_field": "expected.json/forbidden_patterns/0",
30
+ "source_ref": "expected.json:forbidden_patterns[0]"
31
+ },
32
+ {
33
+ "authority": "expected.json/max_deps_added",
34
+ "id": "max_deps_added__0",
35
+ "operational_check": "variant arm MUST NOT add more than 0 new npm dependencies (count delta of package.json:dependencies + devDependencies)",
36
+ "severity": "hard",
37
+ "source_field": "expected.json/max_deps_added",
38
+ "source_ref": "expected.json:max_deps_added"
39
+ },
40
+ {
41
+ "authority": "expected.json/required_files",
42
+ "id": "required_file__server_index_js",
43
+ "operational_check": "variant arm output MUST contain file 'server/index.js' (created or preserved)",
44
+ "severity": "hard",
45
+ "source_field": "expected.json/required_files",
46
+ "source_ref": "expected.json:required_files[server/index.js]"
47
+ },
48
+ {
49
+ "authority": "expected.json/required_files",
50
+ "id": "required_file__tests_server_test_js",
51
+ "operational_check": "variant arm output MUST contain file 'tests/server.test.js' (created or preserved)",
52
+ "severity": "hard",
53
+ "source_field": "expected.json/required_files",
54
+ "source_ref": "expected.json:required_files[tests/server.test.js]"
55
+ },
56
+ {
57
+ "authority": "metadata/oracle-allowlist",
58
+ "id": "scope-tier-a:lockfile-deletion",
59
+ "operational_check": "variant arm MUST NOT delete a scaffold-present lockfile",
60
+ "severity": "hard",
61
+ "source_field": "oracle/scope-tier-a/scope-tier-a:lockfile-deletion",
62
+ "source_ref": "oracle-scope-tier-a.py"
63
+ },
64
+ {
65
+ "authority": "metadata/oracle-allowlist",
66
+ "id": "scope-tier-a:tier-a-violation",
67
+ "operational_check": "variant arm MUST NOT add or modify paths matching: docs/roadmap/** | docs/VISION.md | docs/ROADMAP.md | .github/** | node_modules/** | **/node_modules/** | test-results/** | coverage/** | .nyc_output/** | basename suffix .log | basename prefix .env or secrets.",
68
+ "severity": "hard",
69
+ "source_field": "oracle/scope-tier-a/scope-tier-a:tier-a-violation",
70
+ "source_ref": "oracle-scope-tier-a.py"
71
+ },
72
+ {
73
+ "authority": "metadata/oracle-allowlist",
74
+ "id": "scope-tier-b:scope-unmatched",
75
+ "operational_check": "every variant-touched file MUST be either inside spec_output_files (Tier C) OR reachable from a Tier C seed via static JS/TS imports OR matched by expected.json:tier_a_waivers",
76
+ "severity": "warn",
77
+ "source_field": "oracle/scope-tier-b/scope-tier-b:scope-unmatched",
78
+ "source_ref": "oracle-scope-tier-b.py"
79
+ },
80
+ {
81
+ "authority": "expected.json/spec_output_files",
82
+ "id": "spec_output_file__server_index_js",
83
+ "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'server/index.js' is one Tier C seed",
84
+ "severity": "warn",
85
+ "source_field": "expected.json/spec_output_files",
86
+ "source_ref": "expected.json:spec_output_files[server/index.js]"
87
+ },
88
+ {
89
+ "authority": "expected.json/spec_output_files",
90
+ "id": "spec_output_file__tests_server_test_js",
91
+ "operational_check": "variant-touched files MUST be inside (or reachable via static imports from) the spec_output_files set; 'tests/server.test.js' is one Tier C seed",
92
+ "severity": "warn",
93
+ "source_field": "expected.json/spec_output_files",
94
+ "source_ref": "expected.json:spec_output_files[tests/server.test.js]"
95
+ },
96
+ {
97
+ "authority": "metadata/oracle-allowlist",
98
+ "id": "test-fidelity:assertion-regression",
99
+ "operational_check": "effective assertion count MUST NOT drop and skipped-test count MUST NOT rise; vacuous expect.assertions(0) is treated as a real regression",
100
+ "severity": "warn",
101
+ "source_field": "oracle/test-fidelity/test-fidelity:assertion-regression",
102
+ "source_ref": "oracle-test-fidelity.py"
103
+ },
104
+ {
105
+ "authority": "metadata/oracle-allowlist",
106
+ "id": "test-fidelity:mock-swap",
107
+ "operational_check": "post-arm test file MUST NOT swap REAL_PATTERNS hits for MOCK_PATTERNS hits (jest/vi/sinon, nock/msw, app.handle/inject/callback, hand-rolled IncomingMessage/ServerResponse, etc.); a drop in real_calls combined with a rise in mock_calls is a mock-swap flag",
108
+ "severity": "flag",
109
+ "source_field": "oracle/test-fidelity/test-fidelity:mock-swap",
110
+ "source_ref": "oracle-test-fidelity.py"
111
+ },
112
+ {
113
+ "authority": "metadata/oracle-allowlist",
114
+ "id": "test-fidelity:test-file-deleted",
115
+ "operational_check": "no scaffold-present test file may be deleted by the variant arm; deletion of an existing tests/*.test.* / *.spec.* / *.e2e.* file is a flag-severity finding",
116
+ "severity": "flag",
117
+ "source_field": "oracle/test-fidelity/test-fidelity:test-file-deleted",
118
+ "source_ref": "oracle-test-fidelity.py"
119
+ },
120
+ {
121
+ "authority": "metadata/oracle-allowlist",
122
+ "id": "test-fidelity:test-file-renamed",
123
+ "operational_check": "rename of a scaffold-present test file is warn-severity (content fidelity not verified across renames in step 1)",
124
+ "severity": "warn",
125
+ "source_field": "oracle/test-fidelity/test-fidelity:test-file-renamed",
126
+ "source_ref": "oracle-test-fidelity.py"
127
+ },
128
+ {
129
+ "authority": "expected.json/verification_commands",
130
+ "id": "verification__6001efe2",
131
+ "operational_check": "running `node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?per_page=abc`, r => { console.log(r.statusCode); s.close(); }); });'` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['400']; stdout MUST NOT contain any of []",
132
+ "severity": "hard",
133
+ "source_field": "expected.json/verification_commands/3",
134
+ "source_ref": "expected.json:verification_commands[3]"
135
+ },
136
+ {
137
+ "authority": "expected.json/verification_commands",
138
+ "id": "verification__6517d995",
139
+ "operational_check": "running `node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items`, r => { let b = \"\"; r.on(\"data\", c=>b+=c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(JSON.stringify({ total: d.total, page: d.page, per_page: d.per_page, items_len: d.items.length })); s.close(); }); }); });'` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['\"total\":2', '\"page\":1']; stdout MUST NOT contain any of []",
140
+ "severity": "hard",
141
+ "source_field": "expected.json/verification_commands/1",
142
+ "source_ref": "expected.json:verification_commands[1]"
143
+ },
144
+ {
145
+ "authority": "expected.json/verification_commands",
146
+ "id": "verification__73df5e81",
147
+ "operational_check": "running `node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?page=2&per_page=1`, r => { let b = \"\"; r.on(\"data\", c=>b+=c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(d.items[0] && d.items[0].name); s.close(); }); }); });'` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of ['beta']; stdout MUST NOT contain any of []",
148
+ "severity": "hard",
149
+ "source_field": "expected.json/verification_commands/2",
150
+ "source_ref": "expected.json:verification_commands[2]"
151
+ },
152
+ {
153
+ "authority": "expected.json/verification_commands",
154
+ "id": "verification__7c5f3637",
155
+ "operational_check": "running `node --test tests/server.test.js` in the post-arm work dir MUST exit with code 0; stdout MUST contain all of []; stdout MUST NOT contain any of ['fail ']",
156
+ "severity": "hard",
157
+ "source_field": "expected.json/verification_commands/0",
158
+ "source_ref": "expected.json:verification_commands[0]"
159
+ }
160
+ ],
161
+ "schema_version": "1"
162
+ }
@@ -0,0 +1,65 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/server.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": [
8
+ "not ok "
9
+ ]
10
+ },
11
+ {
12
+ "cmd": "node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items`, r => { let b = \"\"; r.on(\"data\", c=>b+=c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(JSON.stringify({ total: d.total, page: d.page, per_page: d.per_page, items_len: d.items.length })); s.close(); }); }); });'",
13
+ "exit_code": 0,
14
+ "stdout_contains": [
15
+ "\"total\":2",
16
+ "\"page\":1"
17
+ ],
18
+ "stdout_not_contains": []
19
+ },
20
+ {
21
+ "cmd": "node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?page=2&per_page=1`, r => { let b = \"\"; r.on(\"data\", c=>b+=c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(d.items[0] && d.items[0].name); s.close(); }); }); });'",
22
+ "exit_code": 0,
23
+ "stdout_contains": [
24
+ "beta"
25
+ ],
26
+ "stdout_not_contains": []
27
+ },
28
+ {
29
+ "cmd": "node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?per_page=abc`, r => { console.log(r.statusCode); s.close(); }); });'",
30
+ "exit_code": 0,
31
+ "stdout_contains": [
32
+ "400"
33
+ ],
34
+ "stdout_not_contains": []
35
+ }
36
+ ],
37
+ "forbidden_patterns": [
38
+ {
39
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|'')",
40
+ "description": "silent catch returning fallback",
41
+ "files": [
42
+ "server/index.js"
43
+ ],
44
+ "severity": "disqualifier"
45
+ },
46
+ {
47
+ "pattern": "/\\*\\s*eslint-disable",
48
+ "description": "eslint-disable without explicit per-issue justification",
49
+ "files": [
50
+ "server/index.js"
51
+ ],
52
+ "severity": "disqualifier"
53
+ }
54
+ ],
55
+ "required_files": [
56
+ "server/index.js",
57
+ "tests/server.test.js"
58
+ ],
59
+ "forbidden_files": [],
60
+ "max_deps_added": 0,
61
+ "spec_output_files": [
62
+ "server/index.js",
63
+ "tests/server.test.js"
64
+ ]
65
+ }
@@ -0,0 +1,19 @@
1
+ {
2
+ "id": "F3-backend-contract-risk",
3
+ "category": "high-risk",
4
+ "difficulty": "high",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Change the GET /items response shape from { items } to a paginated { items, total, page, per_page } while keeping the existing 1-based id semantics and updating tests. A lazy implementation will leave tests broken or drop the items array — the spec requires both.",
10
+ "pair_plan_oracle_categories": [
11
+ "scope-tier-a:lockfile-deletion",
12
+ "scope-tier-a:tier-a-violation",
13
+ "scope-tier-b:scope-unmatched",
14
+ "test-fidelity:assertion-regression",
15
+ "test-fidelity:mock-swap",
16
+ "test-fidelity:test-file-deleted",
17
+ "test-fidelity:test-file-renamed"
18
+ ]
19
+ }
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env bash
2
+ # F3 setup — no changes to base test-repo. Task modifies existing server/index.js.
3
+ set -e
4
+ exit 0
@@ -0,0 +1,56 @@
1
+ ---
2
+ id: "F3-backend-contract-risk"
3
+ title: "Paginate GET /items response"
4
+ status: planned
5
+ complexity: high
6
+ depends-on: []
7
+ ---
8
+
9
+ # F3 Paginate `GET /items`
10
+
11
+ ## Context
12
+
13
+ `server/index.js` currently returns `{ items: [...] }` for `GET /items` with
14
+ no pagination metadata. As the dataset grows, clients need a `total` and
15
+ paging information. The task: wrap the existing response in a pagination
16
+ envelope, accept `?page` and `?per_page` query parameters, and update tests
17
+ so existing assertions continue to pass alongside new paging assertions.
18
+
19
+ ## Requirements
20
+
21
+ - [ ] `GET /items` (no query) returns `{ items, total, page, per_page }` where:
22
+ - `items` is the full list (baseline repo has 2 items).
23
+ - `total` is the full item count.
24
+ - `page` is `1`.
25
+ - `per_page` is the full item count when no pagination was requested.
26
+ - [ ] `GET /items?page=1&per_page=1` returns the first item wrapped in the envelope with `items.length === 1`, `total === 2`, `page === 1`, `per_page === 1`.
27
+ - [ ] `GET /items?page=2&per_page=1` returns the second item similarly.
28
+ - [ ] `GET /items?page=99&per_page=1` returns `items: []`, `total === 2`, `page === 99`, `per_page === 1` (out-of-range page is allowed — bare empty array, never a 404).
29
+ - [ ] `GET /items/:id` behavior unchanged (the per-item route does NOT get paginated).
30
+ - [ ] `tests/server.test.js` is updated so every existing assertion still holds (semantically) AND the new paging behavior is covered by at least two new tests.
31
+ - [ ] `GET /health` continues to return `{ status: 'ok' }` unchanged.
32
+
33
+ ## Constraints
34
+
35
+ - **No new npm dependencies.** Use only Express + built-ins already in the repo.
36
+ - **No silent catches.** Invalid `page` or `per_page` (non-numeric, zero, negative) must respond 400 with `{ error: 'invalid_query', field }`.
37
+ - **No breaking change to `/items/:id`.** The per-item route must keep its current contract (the fixture explicitly does NOT paginate single-item lookups).
38
+ - **Backward-compat note**: clients that previously read `response.items` MUST still get the array at the same key inside the new envelope.
39
+
40
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
41
+
42
+ ## Out of Scope
43
+
44
+ - Caching, rate limiting, authentication.
45
+ - Converting `items` to a database-backed list.
46
+ - Touching `bin/cli.js`, `web/`, or `tests/cli.test.js`.
47
+ - Adding a new route.
48
+
49
+ ## Verification
50
+
51
+ - Server start: `node server/index.js` listens on port 3000 (exit via SIGINT).
52
+ - `curl -s http://127.0.0.1:3000/items | jq '.total'` returns `2`.
53
+ - `curl -s 'http://127.0.0.1:3000/items?per_page=1&page=2' | jq '.items[0].name'` returns `"beta"`.
54
+ - `curl -s 'http://127.0.0.1:3000/items?per_page=abc' -o /dev/null -w '%{http_code}'` returns `400`.
55
+ - `node --test tests/server.test.js` passes; must include ≥ 2 new paging tests.
56
+ - `git diff --stat` shows only `server/index.js` and `tests/server.test.js` touched.
@@ -0,0 +1,9 @@
1
+ The `GET /items` endpoint in `server/index.js` currently returns `{ items: [...] }`. Paginate it: the response should be `{ items, total, page, per_page }`. Accept `?page` and `?per_page` query params. When no params are given, return everything on page 1 with `per_page` equal to the full count.
2
+
3
+ Keep `GET /items/:id` unchanged (no pagination on single-item lookup). `GET /health` stays as-is.
4
+
5
+ Invalid `page` or `per_page` (non-numeric, zero, negative) → respond 400 with `{ error: 'invalid_query', field: '<name>' }`. Out-of-range page (beyond the last item) returns an empty `items` array, NOT a 404.
6
+
7
+ Update `tests/server.test.js` so existing behavior is still covered AND you add at least two new tests for the paging behavior.
8
+
9
+ No new npm dependencies. Only touch `server/index.js` and `tests/server.test.js`.
@@ -0,0 +1,40 @@
1
+ # F4 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Exercises the browser-validate phase of the pipeline (Phase 1.5). Catches
6
+ web-UI-only regressions that unit tests can't see and that server/integration
7
+ tests won't surface.
8
+
9
+ ## Failure modes detected
10
+
11
+ - **Italic via Unicode.** Arms may reach for Unicode italic characters
12
+ (`𝑖𝑡𝑎𝑙𝑖𝑐`) instead of CSS. Spec explicitly forbids this because it breaks
13
+ screen readers.
14
+ - **CDN link.** Linking to Google Fonts or an external CSS cuts the bench
15
+ and breaks offline / air-gapped runs — disqualifier.
16
+ - **Breaking Greet.** Careless refactors rewire the Greet button's handler
17
+ by mistake. Pipeline's Phase 1.5 browser-validate + dedicated spec test
18
+ catches it.
19
+ - **Accessibility drift.** Missing/incorrect `aria-label` on button.
20
+
21
+ ## Pipeline exercise
22
+
23
+ - Phase 1.5 BROWSER VALIDATE is the primary gate (web file changes trigger it).
24
+ - Phase 3 CRITIC design checks the DOM structure and event-handler wiring.
25
+
26
+ ## Caveats
27
+
28
+ - Playwright requires browser binaries installed locally. If the runner
29
+ machine lacks them, the browser test commands will fail. The suite
30
+ runner can still score the diff + grep checks, but the Playwright
31
+ command will show exit ≠ 0.
32
+ - The bench runner sets `BROWSER_METADATA` so future versions can wire
33
+ stricter browser-required gating; today the fixture only checks file
34
+ presence in verification.
35
+
36
+ ## Rotation trigger
37
+
38
+ When both arms consistently produce correct output AND include accessible
39
+ markup without pipeline intervention, rotate to a harder UI task (e.g., a
40
+ form with validation states).
@@ -0,0 +1,57 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "grep -q 'id=\"whisper\"' web/index.html && echo OK",
5
+ "exit_code": 0,
6
+ "stdout_contains": [
7
+ "OK"
8
+ ],
9
+ "stdout_not_contains": []
10
+ },
11
+ {
12
+ "cmd": "grep -q 'hello from bench-test-repo' web/index.html && echo OK",
13
+ "exit_code": 0,
14
+ "stdout_contains": [
15
+ "OK"
16
+ ],
17
+ "stdout_not_contains": []
18
+ },
19
+ {
20
+ "cmd": "grep -qE '(italic|font-style)' web/index.html && echo OK",
21
+ "exit_code": 0,
22
+ "stdout_contains": [
23
+ "OK"
24
+ ],
25
+ "stdout_not_contains": []
26
+ },
27
+ {
28
+ "cmd": "bash -c 'shopt -s nullglob; files=(tests/e2e/*.spec.*); [ ${#files[@]} -gt 0 ] && echo FOUND || { echo MISSING; exit 1; }'",
29
+ "exit_code": 0,
30
+ "stdout_contains": [
31
+ "FOUND"
32
+ ],
33
+ "stdout_not_contains": [
34
+ "MISSING"
35
+ ]
36
+ }
37
+ ],
38
+ "forbidden_patterns": [
39
+ {
40
+ "pattern": "(cdnjs|unpkg|jsdelivr|fonts\\.googleapis)",
41
+ "description": "external CDN reference \u2014 out-of-scope / offline brittleness",
42
+ "files": [
43
+ "web/index.html"
44
+ ],
45
+ "severity": "disqualifier"
46
+ }
47
+ ],
48
+ "required_files": [
49
+ "web/index.html"
50
+ ],
51
+ "forbidden_files": [],
52
+ "max_deps_added": 0,
53
+ "spec_output_files": [
54
+ "web/index.html",
55
+ "tests/e2e/**"
56
+ ]
57
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F4-web-browser-design",
3
+ "category": "stress",
4
+ "difficulty": "medium",
5
+ "timeout_seconds": 1800,
6
+ "required_tools": ["node", "npx"],
7
+ "browser": true,
8
+ "deps_change_expected": false,
9
+ "intent": "Add a second button labelled 'Whisper' to web/index.html that, when clicked, replaces the #output text with 'hello from bench-test-repo' rendered in lowercase italic. The existing 'Greet' button continues to work unchanged. Tests exercise both buttons via the static page (no server)."
10
+ }
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env bash
2
+ # F4 setup — no base changes needed. The task extends web/index.html and
3
+ # creates a Playwright test file.
4
+ set -e
5
+ mkdir -p tests/e2e
6
+ exit 0
@@ -0,0 +1,49 @@
1
+ ---
2
+ id: "F4-web-browser-design"
3
+ title: "Add a Whisper button with italic lowercase output"
4
+ status: planned
5
+ complexity: medium
6
+ depends-on: []
7
+ ---
8
+
9
+ # F4 Add Whisper button
10
+
11
+ ## Context
12
+
13
+ `web/index.html` currently has one button ("Greet") that fills `#output`
14
+ with `Hello from bench-test-repo`. Add a second button beside it labelled
15
+ `Whisper` that fills `#output` with `hello from bench-test-repo` — lowercase
16
+ and italicized — using only the page's own CSS/JS.
17
+
18
+ ## Requirements
19
+
20
+ - [ ] A new `<button id="whisper">Whisper</button>` renders beside the existing `#greet` button.
21
+ - [ ] Clicking `#whisper` sets `#output` textContent to `hello from bench-test-repo` (lowercase, no exclamation).
22
+ - [ ] `#output`'s rendering of the whisper text is italic. Use CSS (inline, a class, or toggling a class). Do not rely on Unicode italic characters.
23
+ - [ ] Clicking `#greet` continues to set `#output` to `Hello from bench-test-repo` as before (no italic styling).
24
+ - [ ] A text node in `#output` is readable by Playwright via `data-testid="output"` (already present in the baseline).
25
+ - [ ] Minimal diff: only `web/index.html` and any new files directly needed for the test harness (e.g., `tests/e2e/whisper.spec.js` per the existing Playwright config).
26
+
27
+ ## Constraints
28
+
29
+ - **No new npm dependencies.** Playwright is already scripted via `npx serve` and the repo's `playwright.config.js`.
30
+ - **No external resources.** Don't link to CDN fonts, external CSS, or remote images.
31
+ - **No inline JS frameworks.** Stick to the vanilla pattern already in `index.html`.
32
+ - **Accessibility.** Both buttons must have accessible names equal to their visible labels; `#whisper` adds `aria-label="whisper"` only if its visible text differs (it doesn't, so leave it off).
33
+
34
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
35
+
36
+ ## Out of Scope
37
+
38
+ - Animations / transitions.
39
+ - Theme toggle / dark mode.
40
+ - Any change to `bin/cli.js`, `server/`, or CLI tests.
41
+ - Moving styles into a separate .css file.
42
+
43
+ ## Verification
44
+
45
+ - Page loads: `npx serve -l 5173 web &` + `curl -s http://127.0.0.1:5173/` returns HTML containing `<button id="whisper"`.
46
+ - Clicking whisper produces `hello from bench-test-repo` in `#output` — verifiable via Playwright:
47
+ `npx playwright test tests/e2e/` passes the whisper spec.
48
+ - Clicking greet still produces `Hello from bench-test-repo` (test stays green).
49
+ - `git diff --stat` shows only `web/index.html` and the added Playwright test file.
@@ -0,0 +1,9 @@
1
+ Add a second button next to the existing "Greet" button in `web/index.html`, labelled "Whisper". When clicked, it should set `#output` to `hello from bench-test-repo` (lowercase, no exclamation mark) rendered in italic.
2
+
3
+ The existing "Greet" button must continue to set `#output` to `Hello from bench-test-repo` as before — no italic, no change.
4
+
5
+ Keep everything self-contained in the page: no CDN fonts, no new npm dependencies, no external resources. Use the same vanilla JS pattern that's already there.
6
+
7
+ Write a Playwright test under `tests/e2e/` that exercises both buttons. The repo already has `playwright.config.js` and serves `web/` via `npx serve -l 5173`.
8
+
9
+ Only touch `web/index.html` and the new Playwright test file.
@@ -0,0 +1,38 @@
1
+ # F5 — Notes
2
+
3
+ ## Purpose
4
+
5
+ The suite's FIX LOOP stress test. The tests are intentionally constructed so
6
+ the obvious first-pass implementation (simple `input.split(' ').filter(w => w === word).length`) passes the basic count case but fails on:
7
+
8
+ - Case insensitivity (`Cat` should match `cat`).
9
+ - Whole-word boundaries (`cat` should NOT match inside `category`).
10
+ - Empty-stdin edge (returning `undefined` instead of `0`).
11
+
12
+ Variant's pipeline is expected to proceed as follows:
13
+ 1. BUILD produces a first implementation.
14
+ 2. BUILD GATE runs `node --test`; some tests fail.
15
+ 3. EVAL emits findings with `criterion_ref` pointing at specific failing cases.
16
+ 4. FIX LOOP round 1 targets those findings and converges.
17
+
18
+ Bare, without a forcing mechanism, often ships the first implementation and
19
+ calls it done. Verification catches that.
20
+
21
+ ## Failure modes detected
22
+
23
+ - **Partial implementation.** Naive token split without regex word boundaries.
24
+ - **Case handling.** Missing `.toLowerCase()` on both sides of the comparison.
25
+ - **Async stdin.** Using `process.stdin.on('data')` without handling `end` properly → program hangs on test invocation.
26
+ - **Forgotten empty case.** `stdin.read()` returning `null` → `null.length` or `undefined` output.
27
+
28
+ ## Pipeline exercise
29
+
30
+ - **Phase 2 EVAL** is the star: it must identify each failing test case with file:line evidence.
31
+ - **Phase 2.5 FIX LOOP** runs at least once. A fixture passing with 0 fix rounds is a smoke signal that the test-trap design is too lenient; inspect.
32
+ - **Phase 1.4 BUILD GATE** uses `node --test` which exits non-zero on any failure, forcing route to 2.5.
33
+
34
+ ## Rotation trigger
35
+
36
+ When fix rounds are consistently 0 across two shipped versions, the trap is too
37
+ easy. Stiffen by adding a fourth test edge (e.g., Unicode folding, hyphenated
38
+ words).
@@ -0,0 +1,65 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/count.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": [
8
+ "not ok "
9
+ ]
10
+ },
11
+ {
12
+ "cmd": "echo 'cat hat CAT category' | node bin/cli.js count cat",
13
+ "exit_code": 0,
14
+ "stdout_contains": [
15
+ "2"
16
+ ],
17
+ "stdout_not_contains": [
18
+ "3",
19
+ "4"
20
+ ]
21
+ },
22
+ {
23
+ "cmd": "echo '' | node bin/cli.js count cat",
24
+ "exit_code": 0,
25
+ "stdout_contains": [
26
+ "0"
27
+ ],
28
+ "stdout_not_contains": []
29
+ },
30
+ {
31
+ "cmd": "node bin/cli.js count",
32
+ "exit_code": 1,
33
+ "stdout_contains": [],
34
+ "stdout_not_contains": []
35
+ },
36
+ {
37
+ "cmd": "node bin/cli.js hello",
38
+ "exit_code": 0,
39
+ "stdout_contains": [
40
+ "Hello, world!"
41
+ ],
42
+ "stdout_not_contains": []
43
+ }
44
+ ],
45
+ "forbidden_patterns": [
46
+ {
47
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
48
+ "description": "empty catch block \u2014 silent error suppression",
49
+ "files": [
50
+ "bin/cli.js"
51
+ ],
52
+ "severity": "disqualifier"
53
+ }
54
+ ],
55
+ "required_files": [
56
+ "bin/cli.js",
57
+ "tests/count.test.js"
58
+ ],
59
+ "forbidden_files": [],
60
+ "max_deps_added": 0,
61
+ "spec_output_files": [
62
+ "bin/cli.js",
63
+ "tests/**/count.test.js"
64
+ ]
65
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F5-fix-loop-red-green",
3
+ "category": "stress",
4
+ "difficulty": "medium",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Make the pre-installed failing tests for a new `count` subcommand pass. The tests require case-insensitive whole-word counting of stdin input against a provided word argument. A naive first implementation satisfies basic counts but misses case-insensitivity or whole-word boundaries — EVAL catches it and FIX LOOP drives the correct second pass."
10
+ }