devlyn-cli 2.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/CLAUDE.md +1 -1
  2. package/README.md +1 -1
  3. package/benchmark/auto-resolve/README.md +318 -2
  4. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  91. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  100. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  101. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  102. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  103. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  104. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  105. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  106. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  107. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  110. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  111. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  112. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  113. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  114. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  116. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  117. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  118. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  119. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  120. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  121. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  122. package/bin/devlyn.js +56 -10
  123. package/config/skills/_shared/archive_run.py +3 -0
  124. package/config/skills/_shared/codex-config.md +2 -2
  125. package/config/skills/_shared/codex-monitored.sh +72 -7
  126. package/config/skills/_shared/collect-codex-findings.py +125 -0
  127. package/config/skills/_shared/engine-preflight.md +1 -1
  128. package/config/skills/_shared/expected.schema.json +18 -0
  129. package/config/skills/_shared/spec-verify-check.py +312 -10
  130. package/config/skills/_shared/verify-merge-findings.py +327 -0
  131. package/config/skills/devlyn:ideate/SKILL.md +1 -1
  132. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  133. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  134. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  135. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  136. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  137. package/package.json +1 -1
  138. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,251 @@
1
+ #!/usr/bin/env bash
2
+ # F12 setup — seed the shared secret and stage verifier scripts.
3
+ set -e
4
+
5
+ mkdir -p data
6
+
7
+ # Single-line secret, no trailing newline (printf, not echo).
8
+ printf 'wh_test_secret_a3f9e1c2_d4b6e7' > data/webhook-secret.txt
9
+
10
+ # Sample event payload for human inspection (not used by verifiers).
11
+ cat > data/_sample-event.json <<'JSON'
12
+ {"id":"evt_001","type":"order.created","timestamp":1735689600,"data":{"order_id":"o_42","amount_cents":1500}}
13
+ JSON
14
+
15
+ # Verifier 1: happy path — exact bytes the provider signed → 200 accepted.
16
+ cat > data/_verify-happy.js <<'JS'
17
+ 'use strict';
18
+ const http = require('http');
19
+ const fs = require('fs');
20
+ const crypto = require('crypto');
21
+ const { app } = require('../server');
22
+
23
+ const SECRET = fs.readFileSync('data/webhook-secret.txt');
24
+
25
+ function hmacHex(bytes) {
26
+ return crypto.createHmac('sha256', SECRET).update(bytes).digest('hex');
27
+ }
28
+
29
+ function postRaw(port, bytes, sig) {
30
+ return new Promise((resolve) => {
31
+ const req = http.request(
32
+ { host: '127.0.0.1', port, method: 'POST', path: '/webhook',
33
+ headers: {
34
+ 'Content-Type': 'application/json',
35
+ 'Content-Length': Buffer.byteLength(bytes),
36
+ 'X-Signature': sig,
37
+ } },
38
+ (r) => {
39
+ let b = ''; r.on('data', (c) => (b += c));
40
+ r.on('end', () => {
41
+ let d = null; try { d = JSON.parse(b); } catch {}
42
+ resolve({ status: r.statusCode, body: d });
43
+ });
44
+ }
45
+ );
46
+ req.on('error', () => resolve({ status: 0, body: null }));
47
+ req.write(bytes);
48
+ req.end();
49
+ });
50
+ }
51
+
52
+ const s = http.createServer(app).listen(0, async () => {
53
+ const { port } = s.address();
54
+ const body = Buffer.from('{"id":"evt_h1","type":"order.created","timestamp":1,"data":{}}');
55
+ const sig = hmacHex(body);
56
+ const r = await postRaw(port, body, sig);
57
+ const ok = r.status === 200 && r.body && r.body.accepted === true && r.body.id === 'evt_h1';
58
+ console.log(JSON.stringify({ status: r.status, body: r.body, ok }));
59
+ s.close();
60
+ process.exit(ok ? 0 : 1);
61
+ });
62
+ JS
63
+
64
+ # Verifier 2: replay (same id) — first 200, second 409 even with valid sig.
65
+ cat > data/_verify-replay.js <<'JS'
66
+ 'use strict';
67
+ const http = require('http');
68
+ const fs = require('fs');
69
+ const crypto = require('crypto');
70
+ const { app } = require('../server');
71
+
72
+ const SECRET = fs.readFileSync('data/webhook-secret.txt');
73
+ function hmacHex(bytes) {
74
+ return crypto.createHmac('sha256', SECRET).update(bytes).digest('hex');
75
+ }
76
+ function postRaw(port, bytes, sig) {
77
+ return new Promise((resolve) => {
78
+ const req = http.request(
79
+ { host: '127.0.0.1', port, method: 'POST', path: '/webhook',
80
+ headers: {
81
+ 'Content-Type': 'application/json',
82
+ 'Content-Length': Buffer.byteLength(bytes),
83
+ 'X-Signature': sig,
84
+ } },
85
+ (r) => {
86
+ let b = ''; r.on('data', (c) => (b += c));
87
+ r.on('end', () => {
88
+ let d = null; try { d = JSON.parse(b); } catch {}
89
+ resolve({ status: r.statusCode, body: d });
90
+ });
91
+ }
92
+ );
93
+ req.on('error', () => resolve({ status: 0, body: null }));
94
+ req.write(bytes);
95
+ req.end();
96
+ });
97
+ }
98
+
99
+ const s = http.createServer(app).listen(0, async () => {
100
+ const { port } = s.address();
101
+ const body = Buffer.from('{"id":"evt_r1","type":"x","timestamp":1,"data":{}}');
102
+ const sig = hmacHex(body);
103
+ const first = await postRaw(port, body, sig);
104
+ const second = await postRaw(port, body, sig);
105
+ const ok = first.status === 200 &&
106
+ second.status === 409 &&
107
+ second.body && second.body.error === 'duplicate_event' && second.body.id === 'evt_r1';
108
+ console.log(JSON.stringify({ first: first.status, second: second.status, second_body: second.body, ok }));
109
+ s.close();
110
+ process.exit(ok ? 0 : 1);
111
+ });
112
+ JS
113
+
114
+ # Verifier 3: tampered body → 401. Body changed AFTER signing; the original
115
+ # sig now corresponds to bytes that are no longer in the request.
116
+ cat > data/_verify-tampered.js <<'JS'
117
+ 'use strict';
118
+ const http = require('http');
119
+ const fs = require('fs');
120
+ const crypto = require('crypto');
121
+ const { app } = require('../server');
122
+
123
+ const SECRET = fs.readFileSync('data/webhook-secret.txt');
124
+ function hmacHex(bytes) {
125
+ return crypto.createHmac('sha256', SECRET).update(bytes).digest('hex');
126
+ }
127
+ function postRaw(port, bytes, sig) {
128
+ return new Promise((resolve) => {
129
+ const req = http.request(
130
+ { host: '127.0.0.1', port, method: 'POST', path: '/webhook',
131
+ headers: {
132
+ 'Content-Type': 'application/json',
133
+ 'Content-Length': Buffer.byteLength(bytes),
134
+ 'X-Signature': sig,
135
+ } },
136
+ (r) => {
137
+ let b = ''; r.on('data', (c) => (b += c));
138
+ r.on('end', () => {
139
+ let d = null; try { d = JSON.parse(b); } catch {}
140
+ resolve({ status: r.statusCode, body: d });
141
+ });
142
+ }
143
+ );
144
+ req.on('error', () => resolve({ status: 0, body: null }));
145
+ req.write(bytes);
146
+ req.end();
147
+ });
148
+ }
149
+
150
+ const s = http.createServer(app).listen(0, async () => {
151
+ const { port } = s.address();
152
+ const original = Buffer.from('{"id":"evt_t1","type":"x","timestamp":1,"data":{"amount":100}}');
153
+ const sig = hmacHex(original); // sig over the original
154
+ const tampered = Buffer.from('{"id":"evt_t1","type":"x","timestamp":1,"data":{"amount":9999}}');
155
+ const r = await postRaw(port, tampered, sig);
156
+ const ok = r.status === 401 && r.body && r.body.error === 'invalid_signature';
157
+ console.log(JSON.stringify({ status: r.status, body: r.body, ok }));
158
+ s.close();
159
+ process.exit(ok ? 0 : 1);
160
+ });
161
+ JS
162
+
163
+ # Verifier 4: missing X-Signature header → 401 (a malformed header is not exercised here).
164
+ cat > data/_verify-missing-sig.js <<'JS'
165
+ 'use strict';
166
+ const http = require('http');
167
+ const { app } = require('../server');
168
+
169
+ const s = http.createServer(app).listen(0, async () => {
170
+ const { port } = s.address();
171
+ const body = Buffer.from('{"id":"evt_m1","type":"x","timestamp":1,"data":{}}');
172
+ const r = await new Promise((resolve) => {
173
+ const req = http.request(
174
+ { host: '127.0.0.1', port, method: 'POST', path: '/webhook',
175
+ headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(body) } },
176
+ (rs) => {
177
+ let b = ''; rs.on('data', (c) => (b += c));
178
+ rs.on('end', () => {
179
+ let d = null; try { d = JSON.parse(b); } catch {}
180
+ resolve({ status: rs.statusCode, body: d });
181
+ });
182
+ }
183
+ );
184
+ req.on('error', () => resolve({ status: 0, body: null }));
185
+ req.write(body);
186
+ req.end();
187
+ });
188
+ const ok = r.status === 401 && r.body && r.body.error === 'invalid_signature';
189
+ console.log(JSON.stringify({ status: r.status, body: r.body, ok }));
190
+ s.close();
191
+ process.exit(ok ? 0 : 1);
192
+ });
193
+ JS
194
+
195
+ # Verifier 5: raw-body discrimination — same parsed object, different on-wire
196
+ # bytes, signature of an alternate-bytes form. A naive impl that re-serializes
197
+ # req.body and HMACs the result will accept this; a correct impl that HMACs
198
+ # the actual received bytes will reject. The "alternate" bytes here are the
199
+ # canonical (compact) form of the same JSON, so the naive impl's re-stringify
200
+ # produces bytes whose HMAC matches the provided signature.
201
+ cat > data/_verify-raw-body.js <<'JS'
202
+ 'use strict';
203
+ const http = require('http');
204
+ const fs = require('fs');
205
+ const crypto = require('crypto');
206
+ const { app } = require('../server');
207
+
208
+ const SECRET = fs.readFileSync('data/webhook-secret.txt');
209
+ function hmacHex(bytes) {
210
+ return crypto.createHmac('sha256', SECRET).update(bytes).digest('hex');
211
+ }
212
+
213
+ const s = http.createServer(app).listen(0, async () => {
214
+ const { port } = s.address();
215
+ // Canonical bytes (compact form) — what JSON.stringify(parsed_obj) emits.
216
+ const canonical = Buffer.from('{"id":"evt_rb1","type":"x","timestamp":1,"data":{}}');
217
+ // Sig over canonical bytes — what a naive impl will produce when it
218
+ // re-stringifies req.body and HMACs the result.
219
+ const sig = hmacHex(canonical);
220
+ // Wire bytes: same parsed JSON, but with extra whitespace. Different bytes
221
+ // than canonical; an HMAC over THESE bytes is NOT `sig`.
222
+ const wire = Buffer.from('{ "id":"evt_rb1","type":"x","timestamp":1,"data":{} }');
223
+ const r = await new Promise((resolve) => {
224
+ const req = http.request(
225
+ { host: '127.0.0.1', port, method: 'POST', path: '/webhook',
226
+ headers: {
227
+ 'Content-Type': 'application/json',
228
+ 'Content-Length': Buffer.byteLength(wire),
229
+ 'X-Signature': sig,
230
+ } },
231
+ (rs) => {
232
+ let b = ''; rs.on('data', (c) => (b += c));
233
+ rs.on('end', () => {
234
+ let d = null; try { d = JSON.parse(b); } catch {}
235
+ resolve({ status: rs.statusCode, body: d });
236
+ });
237
+ }
238
+ );
239
+ req.on('error', () => resolve({ status: 0, body: null }));
240
+ req.write(wire);
241
+ req.end();
242
+ });
243
+ // Correct: 401 (HMAC over wire bytes ≠ sig). Naive (JSON.stringify path): 200.
244
+ const ok = r.status === 401 && r.body && r.body.error === 'invalid_signature';
245
+ console.log(JSON.stringify({ status: r.status, body: r.body, ok }));
246
+ s.close();
247
+ process.exit(ok ? 0 : 1);
248
+ });
249
+ JS
250
+
251
+ exit 0
@@ -0,0 +1,58 @@
1
+ ---
2
+ id: "F12-webhook-raw-body-signature"
3
+ title: "Webhook signature verification endpoint"
4
+ status: planned
5
+ complexity: high
6
+ depends-on: []
7
+ ---
8
+
9
+ # F12 Webhook signature verification endpoint
10
+
11
+ ## Context
12
+
13
+ A third-party provider sends event notifications via webhook. Each
14
+ delivery is signed: the provider computes HMAC-SHA256 over the body
15
+ bytes using a shared secret and includes the lowercase hex-encoded
16
+ result in the `X-Signature` header.
17
+
18
+ The provider may retry a delivery (network blip, ack lost). When it
19
+ does, it sends the same payload with the same `id` field — the server
20
+ must accept the event at most once and respond `409` on a re-delivery.
21
+
22
+ The shared secret is stored at `data/webhook-secret.txt` (single line,
23
+ no trailing newline).
24
+
25
+ ## Requirements
26
+
27
+ - [ ] `POST /webhook` accepts JSON-shaped bodies of the form `{ id, type, timestamp, data }` where `id` and `type` are non-empty strings, `timestamp` is a number, and `data` is an object.
28
+ - [ ] On valid signature + first delivery of a given `id`: respond `200` with `{ accepted: true, id }`.
29
+ - [ ] On valid signature + re-delivery of an already-seen `id`: respond `409` with `{ error: 'duplicate_event', id }`. The body of the duplicate is irrelevant — once an `id` is accepted, the same `id` is permanently rejected for the lifetime of the server process.
30
+ - [ ] On a missing or invalid `X-Signature`, or a signature that does not match the body: respond `401` with `{ error: 'invalid_signature' }`. Verification compares the signature the provider sent against an HMAC-SHA256 the server computes the same way the provider did.
31
+ - [ ] On a body that fails the shape check (missing fields, wrong types, empty `id`/`type`): respond `400` with `{ error: 'invalid_body' }`. Sig check still runs first — a bad body with a valid sig is still 400, not 401.
32
+ - [ ] `tests/server.test.js` is updated. Existing assertions still hold AND at least three new tests cover: happy path, replay (same id) → 409, tampered body with stale signature → 401.
33
+
34
+ ## Constraints
35
+
36
+ - **No new npm dependencies.** Express + Node `crypto` + Node built-ins only.
37
+ - **No silent catches.** Errors in the verification path surface as `500` with a clear body.
38
+ - **Use `crypto.timingSafeEqual` for the signature comparison.** A non-constant-time `===` between hex strings leaks information about the true MAC byte-by-byte.
39
+ - **No breaking change** to existing `/items`, `/items/:id`, `/health`.
40
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
41
+
42
+ ## Out of Scope
43
+
44
+ - Authentication beyond signature verification.
45
+ - Rate limiting, replay window TTL (the seen-id set is process-lifetime only).
46
+ - Persistence of seen-ids across restarts.
47
+ - Touching `bin/cli.js`, `web/`, or `tests/cli.test.js`.
48
+
49
+ ## Verification
50
+
51
+ - `node --test tests/server.test.js` exits 0.
52
+ - A POST with the provider's sig over the exact body bytes returns 200.
53
+ - A second POST with the same `id` returns 409 even if the signature is valid.
54
+ - A second POST with the same accepted `id` returns 409 even if the duplicate
55
+ body would otherwise fail shape validation; duplicate id wins after acceptance.
56
+ - A POST whose body has been modified after signing (or whose `X-Signature` was computed against different bytes than the body now contains) returns 401.
57
+ - A POST with a missing or malformed `X-Signature` header returns 401.
58
+ - `git diff --stat` shows only `server/index.js` and `tests/server.test.js` touched.
@@ -0,0 +1,13 @@
1
+ Add `POST /webhook` to `server/index.js` to receive signed event notifications from a third-party provider.
2
+
3
+ Each request carries an `X-Signature` header with a lowercase hex HMAC-SHA256 over the body, computed using the shared secret stored at `data/webhook-secret.txt`. Verify the signature; if it doesn't match what you compute the same way, respond 401 with `{ error: 'invalid_signature' }`.
4
+
5
+ The body shape is `{ id, type, timestamp, data }` where `id` and `type` must be non-empty strings, `timestamp` is a number, and `data` is an object. A body that fails this shape returns 400 with `{ error: 'invalid_body' }` (sig check still runs first — a bad body with a valid sig is 400, not 401).
6
+
7
+ The provider retries on network failure, so the same `id` may arrive twice. After accepting an `id`, any subsequent delivery of that same `id` returns 409 with `{ error: 'duplicate_event', id }` — the second body is irrelevant. First delivery returns 200 with `{ accepted: true, id }`.
8
+
9
+ Use `crypto.timingSafeEqual` for the signature comparison.
10
+
11
+ Update `tests/server.test.js` so existing tests still pass AND add at least three new tests covering: happy path, replay of the same id returns 409, tampered body with the original signature returns 401.
12
+
13
+ No new npm dependencies. Only touch `server/index.js` and `tests/server.test.js`.
@@ -0,0 +1,64 @@
1
+ 'use strict';
2
+
3
+ const http = require('node:http');
4
+ const fs = require('node:fs');
5
+ const crypto = require('node:crypto');
6
+ const path = require('node:path');
7
+
8
+ const workdir = process.env.BENCH_WORKDIR;
9
+ const { app } = require(path.join(workdir, 'server'));
10
+ const secret = fs.readFileSync(path.join(workdir, 'data', 'webhook-secret.txt'));
11
+
12
+ function hmacHex(bytes) {
13
+ return crypto.createHmac('sha256', secret).update(bytes).digest('hex');
14
+ }
15
+
16
+ function postRaw(port, bytes) {
17
+ return new Promise((resolve) => {
18
+ const req = http.request(
19
+ {
20
+ host: '127.0.0.1',
21
+ port,
22
+ method: 'POST',
23
+ path: '/webhook',
24
+ headers: {
25
+ 'Content-Type': 'application/json',
26
+ 'Content-Length': Buffer.byteLength(bytes),
27
+ 'X-Signature': hmacHex(bytes)
28
+ }
29
+ },
30
+ (res) => {
31
+ let body = '';
32
+ res.on('data', (chunk) => { body += chunk; });
33
+ res.on('end', () => {
34
+ let parsed = null;
35
+ try {
36
+ parsed = JSON.parse(body);
37
+ } catch {
38
+ parsed = null;
39
+ }
40
+ resolve({ status: res.statusCode, body: parsed });
41
+ });
42
+ }
43
+ );
44
+ req.on('error', () => resolve({ status: 0, body: null }));
45
+ req.write(bytes);
46
+ req.end();
47
+ });
48
+ }
49
+
50
+ const server = http.createServer(app).listen(0, async () => {
51
+ const { port } = server.address();
52
+ const accepted = Buffer.from('{"id":"evt_hidden_replay","type":"x","timestamp":1,"data":{}}');
53
+ const malformedDuplicate = Buffer.from('{"id":"evt_hidden_replay","type":"x","timestamp":"bad","data":{}}');
54
+ const first = await postRaw(port, accepted);
55
+ const second = await postRaw(port, malformedDuplicate);
56
+ const ok = first.status === 200
57
+ && second.status === 409
58
+ && second.body
59
+ && second.body.error === 'duplicate_event'
60
+ && second.body.id === 'evt_hidden_replay';
61
+ console.log(JSON.stringify({ ok, first: first.status, second: second.status, second_body: second.body }));
62
+ server.close();
63
+ process.exit(ok ? 0 : 1);
64
+ });
@@ -0,0 +1,98 @@
1
+ # F15 — Notes
2
+
3
+ ## Purpose
4
+
5
+ **Frozen-diff review fixture.** The 5th option from Codex R3 (2026-05-05):
6
+ "Seed a plausible, green-but-wrong implementation, then ask arms to
7
+ review/fix against the spec. This isolates external review value from
8
+ implementation variance."
9
+
10
+ Where F10/F11/F12 ask the arm to BUILD an implementation, F15 asks the
11
+ arm to REVIEW pre-existing code and identify a latent correctness issue
12
+ that single-request testing does not surface.
13
+
14
+ This directly tests pair-mode's empirically-validated strength: in iter-
15
+ 0033c the dual-engine VERIFY pair found a unique EACCES/EPERM finding on
16
+ a frozen diff that single-engine review missed (autoresearch/iterations/
17
+ 0033c-l2-new-vs-new-l1.md:216). Generation tasks did not show pair lift;
18
+ review tasks did.
19
+
20
+ ## The seeded bug
21
+
22
+ `server/index.js` ships with a POST `/items` handler that does an awaited
23
+ read-modify-write on `data/items.json`:
24
+
25
+ ```js
26
+ const data = await readStore(); // T1 reads, T2 reads (same view)
27
+ const newId = data.items.length + 1; // both compute the same id
28
+ data.items.push(newItem); // both mutate (separate copies)
29
+ await writeStore(data); // last writer wins
30
+ ```
31
+
32
+ Two concurrent POSTs interleave during the await gaps. The visible failure
33
+ modes:
34
+
35
+ - Duplicate ids: both T1 and T2 compute `length + 1`, return 201 to both
36
+ callers, but final state contains only one of the two new items (still
37
+ with same id).
38
+ - Lost writes: state has length+1 instead of length+2.
39
+
40
+ Single-request testing never triggers this — the race window only opens
41
+ when at least two POSTs are in flight. Pre-staged tests cover happy path
42
+ and validation but NOT concurrency.
43
+
44
+ ## Failure modes detected
45
+
46
+ - **Race ignored**: arm reviews the code, doesn't notice the await gap,
47
+ responds "the implementation looks correct" or makes cosmetic changes.
48
+ Verifier 1 (concurrent POSTs) catches.
49
+ - **Race noticed, fix breaks single-POST**: e.g., adds a global mutex but
50
+ forgets to release it on error. Verifier 2 catches.
51
+ - **Race noticed, fix violates scope**: e.g., replaces JSON file with
52
+ SQLite. Out-of-scope per spec; surfaces in oracle-scope-tier-a / -b.
53
+ - **Silent catch added** to "fix" by hiding the error. Caught by
54
+ forbidden_pattern.
55
+
56
+ ## Pipeline exercise
57
+
58
+ - Phase 0 routing: standard.
59
+ - Phase 1 BUILD: this fixture is review-shaped, not build-shaped.
60
+ The implementer reads the existing `server/index.js` and must derive the race
61
+ from the await pattern alone — no spec wording to anchor on.
62
+ - Phase 2 EVAL: verifies the fix actually addresses concurrency, not just
63
+ cosmetic.
64
+ - Phase 3 CRITIC: scope discipline (no out-of-scope refactor) + production-
65
+ ready (no silent catches in the fix).
66
+
67
+ ## Discrimination expectation
68
+
69
+ Calibration target:
70
+
71
+ - bare arm: 30-65 (passes baseline tests + single-POST verifier; may or
72
+ may not derive the race from cold-read of the code without an explicit
73
+ spec hint).
74
+ - solo arm: 55-80 (skill review pass increases the chance of catching it,
75
+ but the same model is doing both implementation and review).
76
+ - pair arm: 75-90 (cross-perspective review; the second engine reads the
77
+ same code with different priors and is more likely to surface the race).
78
+
79
+ The `intent` and `spec.md` deliberately use "load test" and "correctness
80
+ under load" rather than "race", "concurrency", or "atomic" — those would
81
+ be leading keywords. "Load" alone is not specific enough for the model
82
+ to anchor on a single failure mode without reading the code.
83
+
84
+ ## Public-spec wording — load-bearing
85
+
86
+ The spec says "issues a load test would surface that single-request
87
+ testing would not" — natural product framing, no implementation hint.
88
+ The Requirements bullet lists "lost updates, duplicated identifiers, and
89
+ inconsistent response/state pairs" as common load-related issues — this
90
+ is education for a generic reviewer, not a leak. A naive review-only pass
91
+ that doesn't read the await sequence carefully will gloss over these.
92
+
93
+ ## Rotation trigger
94
+
95
+ Retire when both bare and solo arms consistently land > 85 across two
96
+ shipped versions. If the 2026 baseline reliably catches the awaited read-modify-write race
97
+ on cold read of someone else's code, the frozen-diff review thesis also
98
+ needs updating — not just the seeded bug.
@@ -0,0 +1,46 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/server.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": ["not ok "]
8
+ },
9
+ {
10
+ "cmd": "node data/_verify-concurrent.js",
11
+ "exit_code": 0,
12
+ "stdout_contains": ["\"ok\":true"],
13
+ "stdout_not_contains": []
14
+ },
15
+ {
16
+ "cmd": "node data/_verify-single.js",
17
+ "exit_code": 0,
18
+ "stdout_contains": ["\"ok\":true"],
19
+ "stdout_not_contains": []
20
+ }
21
+ ],
22
+ "forbidden_patterns": [
23
+ {
24
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
25
+ "description": "silent catch returning fallback",
26
+ "files": ["server/index.js"],
27
+ "severity": "disqualifier"
28
+ },
29
+ {
30
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
31
+ "description": "empty catch block",
32
+ "files": ["server/index.js"],
33
+ "severity": "disqualifier"
34
+ },
35
+ {
36
+ "pattern": "/\\*\\s*eslint-disable",
37
+ "description": "eslint-disable without scoped justification",
38
+ "files": ["server/index.js"],
39
+ "severity": "disqualifier"
40
+ }
41
+ ],
42
+ "required_files": ["server/index.js", "tests/server.test.js", "data/items.json"],
43
+ "forbidden_files": [],
44
+ "max_deps_added": 0,
45
+ "spec_output_files": ["server/index.js", "tests/server.test.js"]
46
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F15-frozen-diff-race-review",
3
+ "category": "high-risk",
4
+ "difficulty": "high",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Review the existing /items API in server/index.js for correctness issues a load test would surface. Fix what you find and add tests covering the fix. The current implementation has been deployed for a few weeks; existing tests pass."
10
+ }