devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +318 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  12. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  18. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  25. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  31. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  40. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  48. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  56. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  64. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  73. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  82. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  91. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  100. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  101. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  102. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  103. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  104. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  105. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  106. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  107. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  110. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  111. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  112. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  113. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  114. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  116. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  117. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  118. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  119. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  120. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  121. package/config/skills/_shared/archive_run.py +3 -0
  122. package/config/skills/_shared/codex-config.md +2 -2
  123. package/config/skills/_shared/codex-monitored.sh +72 -7
  124. package/config/skills/_shared/collect-codex-findings.py +125 -0
  125. package/config/skills/_shared/engine-preflight.md +1 -1
  126. package/config/skills/_shared/expected.schema.json +18 -0
  127. package/config/skills/_shared/spec-verify-check.py +312 -10
  128. package/config/skills/_shared/verify-merge-findings.py +327 -0
  129. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  130. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  131. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  132. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  133. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  134. package/package.json +1 -1
  135. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,70 @@
1
+ # F11 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Pair-discriminating high-risk fixture. Adds a batch-import write endpoint
6
+ with an all-or-nothing guarantee. The pair-edge mechanism: implementers
7
+ who validate-as-they-go produce a partial-write bug — by the time the
8
+ invalid item is hit and 400 returned, prior items have already been
9
+ appended. The natural shape:
10
+
11
+ ```js
12
+ app.post('/items/import', (req, res) => {
13
+ for (const it of req.body.items) {
14
+ if (!valid(it)) return res.status(400).json(...);
15
+ items.push({ id: nextId(), ...it }); // already mutated
16
+ }
17
+ res.status(201).json({ inserted: req.body.items.length });
18
+ });
19
+ ```
20
+
21
+ This passes the "happy path" test trivially and the "all-bad" test trivially.
22
+ It fails only on the discriminating case: one bad item mid-batch — store
23
+ ends up with the prefix already inserted while the response says 400.
24
+
25
+ A reviewer with fresh eyes asking "what does the store look like after the
26
+ failure response?" catches it; the same model that wrote the loop tends to
27
+ focus on the response correctness without re-examining the store delta.
28
+
29
+ ## Failure modes detected
30
+
31
+ - **Partial inserts** before validation failure (the core discriminator).
32
+ - **Order swap** — implementer inserts at wrong index or sorts unexpectedly.
33
+ - **Id collision** — implementer reuses ids when batch validation rejects.
34
+ - **Silent catch** wrapping `JSON.parse` or validation. Caught by
35
+ forbidden_pattern.
36
+
37
+ ## Pipeline exercise
38
+
39
+ - Phase 1 BUILD: implementer must derive that "all or nothing" requires
40
+ validating the entire batch before any mutation, OR using a
41
+ copy-on-write pattern that rolls back on validation failure.
42
+ - Phase 2 EVAL: scrutinizes whether the new tests assert the
43
+ store-unchanged invariant after a failed batch, not just the 400.
44
+ - Phase 3 CRITIC: production-readiness on the "all or nothing" claim.
45
+
46
+ ## Discrimination expectation
47
+
48
+ Calibration target (set in pyx-memory project memory 2026-05-05):
49
+
50
+ - bare arm: 45-65 (passes spec text, fails the store-unchanged verifier
51
+ on mid-batch failure).
52
+ - solo arm: 65-78 (review pass may catch the store-delta issue if the
53
+ reviewer re-reads the spec; coin-flip).
54
+ - pair arm: 78-90 (cross-perspective derivation of the rollback or
55
+ validate-first pattern).
56
+
57
+ ## Public-spec wording — load-bearing
58
+
59
+ The spec uses "accepted as a whole or rejected as a whole" and "left
60
+ exactly as it was" instead of trigger keywords. If the spec said
61
+ "transactional", "atomic", or "rollback", a single-pass solo arm would
62
+ keyword-match the answer pattern and ace the fixture. The English prose
63
+ forces invariant derivation — the discriminating axis.
64
+
65
+ ## Rotation trigger
66
+
67
+ Retire when both arms consistently land > 90 across two shipped versions,
68
+ OR when "all-or-nothing batch" becomes a recognized pattern such that
69
+ solo arm reliably validates-first on the initial implementation pass.
70
+ Whichever comes first.
@@ -0,0 +1,52 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/server.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": ["not ok "]
8
+ },
9
+ {
10
+ "cmd": "node data/_verify-failed-leaves-unchanged.js",
11
+ "exit_code": 0,
12
+ "stdout_contains": ["\"ok\":true"],
13
+ "stdout_not_contains": []
14
+ },
15
+ {
16
+ "cmd": "node data/_verify-success.js",
17
+ "exit_code": 0,
18
+ "stdout_contains": ["\"ok\":true"],
19
+ "stdout_not_contains": []
20
+ },
21
+ {
22
+ "cmd": "node data/_verify-invalid-body.js",
23
+ "exit_code": 0,
24
+ "stdout_contains": ["\"ok\":true"],
25
+ "stdout_not_contains": []
26
+ }
27
+ ],
28
+ "forbidden_patterns": [
29
+ {
30
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
31
+ "description": "silent catch returning fallback in import path",
32
+ "files": ["server/index.js"],
33
+ "severity": "disqualifier"
34
+ },
35
+ {
36
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
37
+ "description": "empty catch block",
38
+ "files": ["server/index.js"],
39
+ "severity": "disqualifier"
40
+ },
41
+ {
42
+ "pattern": "/\\*\\s*eslint-disable",
43
+ "description": "eslint-disable without scoped justification",
44
+ "files": ["server/index.js"],
45
+ "severity": "disqualifier"
46
+ }
47
+ ],
48
+ "required_files": ["server/index.js", "tests/server.test.js"],
49
+ "forbidden_files": [],
50
+ "max_deps_added": 0,
51
+ "spec_output_files": ["server/index.js", "tests/server.test.js"]
52
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F11-batch-import-all-or-nothing",
3
+ "category": "high-risk",
4
+ "difficulty": "high",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add POST /items/import that accepts a batch of items in one request. Each item must have a non-empty name and a positive integer qty. If any item in the batch fails the check, respond 400 and leave the existing list exactly as it was — partial batches are not accepted."
10
+ }
@@ -0,0 +1,171 @@
1
+ #!/usr/bin/env bash
2
+ # F11 setup — stage verifier scripts. Store remains in-memory; no data file needed.
3
+ set -e
4
+
5
+ mkdir -p data
6
+
7
+ # Verifier: a batch with one bad item in the middle returns 400 and leaves
8
+ # the stored list exactly as it was before the import.
9
+ cat > data/_verify-failed-leaves-unchanged.js <<'JS'
10
+ 'use strict'; // Verifier: a mid-batch invalid item must yield 400 and leave GET /items unchanged.
11
+ const http = require('http');
12
+ const { app } = require('../server');
13
+
14
+ const s = http.createServer(app).listen(0, async () => { // port 0: use whichever port gets assigned
15
+ const { port } = s.address(); // read back the port actually bound
16
+
17
+ const get = (path) => new Promise((resolve) => { // GET helper; only ever resolves, with { status, body }
18
+ http.get(`http://127.0.0.1:${port}${path}`, (r) => {
19
+ let b = ''; r.on('data', (c) => (b += c)); // accumulate the response text
20
+ r.on('end', () => {
21
+ let d = null; try { d = JSON.parse(b); } catch {} // a non-JSON response leaves d as null
22
+ resolve({ status: r.statusCode, body: d });
23
+ });
24
+ }).on('error', () => resolve({ status: 0, body: null })); // request failure is reported as status 0
25
+ });
26
+
27
+ const post = (path, body) => new Promise((resolve) => { // POST helper; serializes body as JSON
28
+ const req = http.request(
29
+ { host: '127.0.0.1', port, method: 'POST', path,
30
+ headers: { 'Content-Type': 'application/json' } },
31
+ (r) => {
32
+ let b = ''; r.on('data', (c) => (b += c)); // accumulate the response text
33
+ r.on('end', () => {
34
+ let d = null; try { d = JSON.parse(b); } catch {} // a non-JSON response leaves d as null
35
+ resolve({ status: r.statusCode, body: d });
36
+ });
37
+ }
38
+ );
39
+ req.on('error', () => resolve({ status: 0, body: null })); // request failure is reported as status 0
40
+ req.write(JSON.stringify(body));
41
+ req.end();
42
+ });
43
+
44
+ const before = await get('/items'); // snapshot of the list before the import attempt
45
+ const r = await post('/items/import', {
46
+ items: [
47
+ { name: 'good1', qty: 1 },
48
+ { name: '', qty: 2 }, // invalid: empty name
49
+ { name: 'good2', qty: 3 },
50
+ ],
51
+ });
52
+ const after = await get('/items'); // snapshot of the list after the import attempt
53
+
54
+ const same = JSON.stringify(before.body) === JSON.stringify(after.body); // compare the two snapshots in serialized form
55
+ const ok = r.status === 400 && same; // pass only when the batch was rejected AND the list did not change
56
+ console.log(JSON.stringify({ status: r.status, store_unchanged: same, ok })); // single-line JSON verdict on stdout
57
+ s.close();
58
+ process.exit(ok ? 0 : 1); // exit code mirrors the verdict
59
+ });
60
+ JS
61
+
62
+ # Verifier: a fully-valid batch returns 201 and items appear in order with distinct ids.
63
+ cat > data/_verify-success.js <<'JS'
64
+ 'use strict'; // Verifier: a fully valid batch must yield 201 and append three items in order with distinct ids.
65
+ const http = require('http');
66
+ const { app } = require('../server');
67
+
68
+ const s = http.createServer(app).listen(0, async () => { // port 0: use whichever port gets assigned
69
+ const { port } = s.address(); // read back the port actually bound
70
+
71
+ const get = (path) => new Promise((resolve) => { // GET helper; only ever resolves, with { status, body }
72
+ http.get(`http://127.0.0.1:${port}${path}`, (r) => {
73
+ let b = ''; r.on('data', (c) => (b += c)); // accumulate the response text
74
+ r.on('end', () => {
75
+ let d = null; try { d = JSON.parse(b); } catch {} // a non-JSON response leaves d as null
76
+ resolve({ status: r.statusCode, body: d });
77
+ });
78
+ }).on('error', () => resolve({ status: 0, body: null })); // request failure is reported as status 0
79
+ });
80
+
81
+ const post = (path, body) => new Promise((resolve) => { // POST helper; serializes body as JSON
82
+ const req = http.request(
83
+ { host: '127.0.0.1', port, method: 'POST', path,
84
+ headers: { 'Content-Type': 'application/json' } },
85
+ (r) => {
86
+ let b = ''; r.on('data', (c) => (b += c)); // accumulate the response text
87
+ r.on('end', () => {
88
+ let d = null; try { d = JSON.parse(b); } catch {} // a non-JSON response leaves d as null
89
+ resolve({ status: r.statusCode, body: d });
90
+ });
91
+ }
92
+ );
93
+ req.on('error', () => resolve({ status: 0, body: null })); // request failure is reported as status 0
94
+ req.write(JSON.stringify(body));
95
+ req.end();
96
+ });
97
+
98
+ const before = await get('/items');
99
+ const beforeLen = (before.body && Array.isArray(before.body.items)) ? before.body.items.length : 0; // a missing or non-array items field counts as length 0
100
+ const r = await post('/items/import', {
101
+ items: [
102
+ { name: 'gamma', qty: 1 },
103
+ { name: 'delta', qty: 2 },
104
+ { name: 'epsilon', qty: 3 },
105
+ ],
106
+ });
107
+ const after = await get('/items');
108
+ const afterItems = (after.body && Array.isArray(after.body.items)) ? after.body.items : []; // a missing or non-array items field counts as empty
109
+
110
+ const ids = afterItems.map((i) => i && i.id);
111
+ const uniq = new Set(ids).size === ids.length; // every id in the resulting list is distinct
112
+ const lenOk = afterItems.length === beforeLen + 3; // exactly three items were added
113
+ const last3Names = afterItems.slice(-3).map((i) => i && i.name).join(',');
114
+ const orderOk = last3Names === 'gamma,delta,epsilon'; // new items sit at the tail in request order
115
+
116
+ const ok = r.status === 201 && uniq && lenOk && orderOk; // all four conditions must hold
117
+ console.log(JSON.stringify({ status: r.status, uniq, lenOk, orderOk, last3Names, ok })); // single-line JSON verdict on stdout
118
+ s.close();
119
+ process.exit(ok ? 0 : 1); // exit code mirrors the verdict
120
+ });
121
+ JS
122
+
123
+ # Verifier: malformed body (missing items) returns 400 and store is unchanged.
124
+ cat > data/_verify-invalid-body.js <<'JS'
125
+ 'use strict'; // Verifier: a body without an items array must yield 400 and leave GET /items unchanged.
126
+ const http = require('http');
127
+ const { app } = require('../server');
128
+
129
+ const s = http.createServer(app).listen(0, async () => { // port 0: use whichever port gets assigned
130
+ const { port } = s.address(); // read back the port actually bound
131
+
132
+ const get = (path) => new Promise((resolve) => { // GET helper; only ever resolves, with { status, body }
133
+ http.get(`http://127.0.0.1:${port}${path}`, (r) => {
134
+ let b = ''; r.on('data', (c) => (b += c)); // accumulate the response text
135
+ r.on('end', () => {
136
+ let d = null; try { d = JSON.parse(b); } catch {} // a non-JSON response leaves d as null
137
+ resolve({ status: r.statusCode, body: d });
138
+ });
139
+ }).on('error', () => resolve({ status: 0, body: null })); // request failure is reported as status 0
140
+ });
141
+
142
+ const post = (path, raw) => new Promise((resolve) => { // POST helper; raw is written as-is, the caller serializes
143
+ const req = http.request(
144
+ { host: '127.0.0.1', port, method: 'POST', path,
145
+ headers: { 'Content-Type': 'application/json' } },
146
+ (r) => {
147
+ let b = ''; r.on('data', (c) => (b += c)); // accumulate the response text
148
+ r.on('end', () => {
149
+ let d = null; try { d = JSON.parse(b); } catch {} // a non-JSON response leaves d as null
150
+ resolve({ status: r.statusCode, body: d });
151
+ });
152
+ }
153
+ );
154
+ req.on('error', () => resolve({ status: 0, body: null })); // request failure is reported as status 0
155
+ req.write(raw);
156
+ req.end();
157
+ });
158
+
159
+ const before = await get('/items'); // snapshot of the list before the request
160
+ const r = await post('/items/import', JSON.stringify({ wrong: 'shape' })); // well-formed JSON that has no items field
161
+ const after = await get('/items'); // snapshot of the list after the request
162
+
163
+ const same = JSON.stringify(before.body) === JSON.stringify(after.body); // compare the two snapshots in serialized form
164
+ const ok = r.status === 400 && same; // pass only when the request was rejected AND the list did not change
165
+ console.log(JSON.stringify({ status: r.status, store_unchanged: same, ok })); // single-line JSON verdict on stdout
166
+ s.close();
167
+ process.exit(ok ? 0 : 1); // exit code mirrors the verdict
168
+ });
169
+ JS
170
+
171
+ exit 0
@@ -0,0 +1,51 @@
1
+ ---
2
+ id: "F11-batch-import-all-or-nothing"
3
+ title: "Batch import endpoint"
4
+ status: planned
5
+ complexity: high
6
+ depends-on: []
7
+ ---
8
+
9
+ # F11 Batch import endpoint
10
+
11
+ ## Context
12
+
13
+ `server/index.js` only allows fetching items today. The task: add
14
+ `POST /items/import` that accepts a batch of items in a single request
15
+ and appends them to the existing list. The endpoint is intended for bulk
16
+ loads where users send a CSV-derived JSON array.
17
+
18
+ The product requirement: a batch must be accepted as a whole or rejected
19
+ as a whole. If any item in the batch is invalid, the response is `400`
20
+ and the stored list is left exactly as it was before the request.
21
+
22
+ ## Requirements
23
+
24
+ - [ ] `POST /items/import` accepts JSON body `{ items: Array<{ name: string, qty: number }> }`.
25
+ - [ ] On full success: respond `201` with `{ inserted: <count> }`. The new items appear in `GET /items` in the order given. Each receives a unique numeric `id` not used by any existing item.
26
+ - [ ] Per-item validation: `name` must be a non-empty string after `trim()`; `qty` must be a positive integer (`> 0`). Any item failing either check makes the batch invalid.
27
+ - [ ] On batch invalidity: respond `400` with `{ error: 'invalid_batch', index: <0-based index of first invalid item>, field: 'name' | 'qty' }`. After the response, `GET /items` returns the same list it would have returned before the import was sent.
28
+ - [ ] Empty body, missing `items`, or `items` not an array → `400` with `{ error: 'invalid_body' }`. Same store-unchanged guarantee.
29
+ - [ ] `tests/server.test.js` is updated. Existing assertions still hold AND at least two new tests cover import: one happy path, one with an invalid element mid-batch that asserts the prior list is unchanged after the failed call.
30
+
31
+ ## Constraints
32
+
33
+ - **No new npm dependencies.**
34
+ - **No silent catches.**
35
+ - **No partial updates.** A batch with N items must produce either N inserts or 0 inserts.
36
+ - **No breaking change** to existing `GET /items` and `GET /items/:id`.
37
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
38
+
39
+ ## Out of Scope
40
+
41
+ - Authentication, rate limiting.
42
+ - File-based persistence (the store stays in-memory for this fixture).
43
+ - CSV parsing or any non-JSON payload.
44
+ - Touching `bin/cli.js`, `web/`, or `tests/cli.test.js`.
45
+
46
+ ## Verification
47
+
48
+ - `node --test tests/server.test.js` exits 0.
49
+ - A POST with one valid + one invalid item returns `400`, AND a subsequent `GET /items` returns the same list as before the import.
50
+ - A POST with all-valid items returns `201`, and the items appear in `GET /items` in order with distinct ids.
51
+ - `git diff --stat` shows only `server/index.js` and `tests/server.test.js` touched.
@@ -0,0 +1,9 @@
1
+ Add `POST /items/import` to `server/index.js`. The endpoint takes a JSON body `{ items: [...] }` where each item has a `name` (non-empty string after trim) and a `qty` (positive integer).
2
+
3
+ A batch is accepted as a whole or not at all. If the batch is fully valid, append every item to the list and return 201 with `{ inserted: <count> }`. If any item fails validation, return 400 with `{ error: 'invalid_batch', index, field }` and leave the stored list exactly as it was before the request.
4
+
5
+ Empty body, missing `items`, or `items` not an array → 400 with `{ error: 'invalid_body' }`. Again, the existing list must be untouched.
6
+
7
+ Update `tests/server.test.js` so existing tests still pass AND add at least two new tests. One must send a batch with one bad item in the middle and verify the prior list is unchanged after the failed call.
8
+
9
+ No new npm dependencies. Only touch `server/index.js` and `tests/server.test.js`.
@@ -0,0 +1,83 @@
1
+ # F12 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Pair-discriminating high-risk fixture targeting **platform/domain
6
+ blindspots** rather than spec-prose-derivable invariants. Codex R3
7
+ (2026-05-05) pivot: after the F10/F11 pilot showed that the 2026 bare arm derives
8
+ spec-hidden invariants from English prose at the same level as pair-mode, the
9
+ discriminator must move from "synonym hiding" to "Node/Express/security
10
+ semantics that the prose does not tutor."
11
+
12
+ The fixture has 5 mechanical verifiers covering 3 distinct domain
13
+ blindspots:
14
+
15
+ 1. **Idempotency** (replay protection). Naive HMAC-correct impls forget
16
+ the seen-id set and respond 200 on a re-delivery. Webhook providers
17
+ retry — pair catches.
18
+
19
+ 2. **Raw-body verification**. Naive impls write `crypto.createHmac(...)
20
+ .update(JSON.stringify(req.body)).digest('hex')` because Express's
21
+ `express.json()` middleware is the obvious body-parsing path. The
22
+ re-serialized form may match a canonical signature, but it does not
23
+ verify the actual bytes the client sent — same parsed object, different
24
+ on-wire bytes pass naive verification.
25
+
26
+ 3. **Timing-safe comparison**. Naive impls `===` the hex strings.
27
+ Production webhook libraries (Stripe, GitHub) use
28
+ `crypto.timingSafeEqual` because non-constant-time compare leaks the true MAC
29
+ byte-by-byte. Spec mentions this directly to bias the model toward
30
+ correctness; the forbidden_pattern slot is reserved if needed.
31
+
32
+ ## Failure modes detected
33
+
34
+ - **Replay accepts**: returns 200 on the second delivery of the same id.
35
+ Verifier 2 catches.
36
+ - **JSON.stringify roundtrip accept**: HMAC over re-serialized req.body
37
+ matches a canonical-signature for non-canonical bytes. Verifier 5
38
+ catches.
39
+ - **Tampered-body accept**: would only happen with a broken impl; verifier
40
+ 3 documents the obvious case for completeness.
41
+ - **Missing-sig accept**: 200 instead of 401. Verifier 4.
42
+ - **Silent catch** wrapping crypto.timingSafeEqual (which throws on length
43
+ mismatch). Caught by forbidden_pattern.
44
+
45
+ ## Pipeline exercise
46
+
47
+ - Phase 1 BUILD: implementer must derive (a) maintain a seen-id set,
48
+ (b) use `express.raw({ type: 'application/json' })` or hand-parse so the
49
+ raw bytes are kept, (c) `crypto.timingSafeEqual` for comparison.
50
+ - Phase 2 EVAL: scrutinizes whether new tests cover replay AND raw-body
51
+ cases, not just happy + tampered.
52
+ - Phase 3 CRITIC: production-readiness on the security claims.
53
+
54
+ ## Discrimination expectation
55
+
56
+ Calibration target:
57
+
58
+ - bare arm: 50-75 (passes 2-4 of 5; likely aces happy + tampered +
59
+ missing-sig; misses replay if no seen-id set, misses raw-body if uses
60
+ JSON.stringify).
61
+ - solo arm: 65-85 (skill review pass may catch one of the two complex
62
+ blindspots, may miss the other).
63
+ - pair arm: 80-95 (cross-perspective derivation of both replay AND
64
+ raw-body invariants).
65
+
66
+ If bare scores 5/5 here too, the "domain blindspot" thesis also dies
67
+ and we re-evaluate strategy at iter level.
68
+
69
+ ## Public-spec wording
70
+
71
+ Spec mentions `crypto.timingSafeEqual` directly (production constraint,
72
+ not a leak — bare in 2026 will likely already use it). Spec describes
73
+ replay protection as "the provider retries on network failure" — natural
74
+ language, no leak of "idempotency" / "deduplication" / "seen-set"
75
+ keywords. Raw-body trap is intentionally left without explicit
76
+ "use express.raw" hint — that's the discrimination axis.
77
+
78
+ ## Rotation trigger
79
+
80
+ Retire when both arms consistently land > 90 across two shipped versions
81
+ on this fixture. If the raw-body verifier (#5) becomes saturated faster
82
+ than the others, replace it with a different platform blindspot rather
83
+ than retiring the whole fixture.
@@ -0,0 +1,74 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/server.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": ["not ok "]
8
+ },
9
+ {
10
+ "cmd": "node data/_verify-happy.js",
11
+ "exit_code": 0,
12
+ "stdout_contains": ["\"ok\":true"],
13
+ "stdout_not_contains": []
14
+ },
15
+ {
16
+ "cmd": "node data/_verify-replay.js",
17
+ "exit_code": 0,
18
+ "stdout_contains": ["\"ok\":true"],
19
+ "stdout_not_contains": []
20
+ },
21
+ {
22
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/replay-malformed-body.js\"",
23
+ "exit_code": 0,
24
+ "stdout_contains": ["\"ok\":true"],
25
+ "stdout_not_contains": [],
26
+ "contract_refs": [
27
+ "The body of the duplicate is irrelevant — once an `id` is accepted, the same `id` is permanently rejected for the lifetime of the server process.",
28
+ "A second POST with the same accepted `id` returns 409 even if the duplicate"
29
+ ]
30
+ },
31
+ {
32
+ "cmd": "node data/_verify-tampered.js",
33
+ "exit_code": 0,
34
+ "stdout_contains": ["\"ok\":true"],
35
+ "stdout_not_contains": []
36
+ },
37
+ {
38
+ "cmd": "node data/_verify-missing-sig.js",
39
+ "exit_code": 0,
40
+ "stdout_contains": ["\"ok\":true"],
41
+ "stdout_not_contains": []
42
+ },
43
+ {
44
+ "cmd": "node data/_verify-raw-body.js",
45
+ "exit_code": 0,
46
+ "stdout_contains": ["\"ok\":true"],
47
+ "stdout_not_contains": []
48
+ }
49
+ ],
50
+ "forbidden_patterns": [
51
+ {
52
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
53
+ "description": "silent catch returning fallback in webhook path",
54
+ "files": ["server/index.js"],
55
+ "severity": "disqualifier"
56
+ },
57
+ {
58
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
59
+ "description": "empty catch block",
60
+ "files": ["server/index.js"],
61
+ "severity": "disqualifier"
62
+ },
63
+ {
64
+ "pattern": "/\\*\\s*eslint-disable",
65
+ "description": "eslint-disable without scoped justification",
66
+ "files": ["server/index.js"],
67
+ "severity": "disqualifier"
68
+ }
69
+ ],
70
+ "required_files": ["server/index.js", "tests/server.test.js", "data/webhook-secret.txt"],
71
+ "forbidden_files": [],
72
+ "max_deps_added": 0,
73
+ "spec_output_files": ["server/index.js", "tests/server.test.js"]
74
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F12-webhook-raw-body-signature",
3
+ "category": "high-risk",
4
+ "difficulty": "high",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add POST /webhook that verifies an HMAC-SHA256 signature in the X-Signature header against the request body, accepts each event id at most once, and rejects tampered or replayed events. The provider computes the signature over the exact bytes of the body it sends; the server must verify against the same bytes."
10
+ }