devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +318 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  12. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  18. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  25. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  31. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  40. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  48. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  56. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  64. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  73. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  82. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  91. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  100. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  101. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  102. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  103. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  104. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  105. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  106. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  107. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  110. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  111. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  112. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  113. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  114. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  116. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  117. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  118. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  119. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  120. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  121. package/config/skills/_shared/archive_run.py +3 -0
  122. package/config/skills/_shared/codex-config.md +2 -2
  123. package/config/skills/_shared/codex-monitored.sh +72 -7
  124. package/config/skills/_shared/collect-codex-findings.py +125 -0
  125. package/config/skills/_shared/engine-preflight.md +1 -1
  126. package/config/skills/_shared/expected.schema.json +18 -0
  127. package/config/skills/_shared/spec-verify-check.py +312 -10
  128. package/config/skills/_shared/verify-merge-findings.py +327 -0
  129. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  130. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  131. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  132. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  133. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  134. package/package.json +1 -1
  135. package/scripts/lint-skills.sh +32 -0
package/CLAUDE.md CHANGED
@@ -24,7 +24,7 @@ The runtime sub-agent contract below (Subtractive-first / Goal-locked / No-worka
24
24
 
25
25
  ## Quick Start
26
26
 
27
- Two skills cover the full cycle post iter-0034 Phase 4 cutover (2026-05-04). `/devlyn:ideate` is OPTIONAL; `/devlyn:resolve` is REQUIRED. **Both default to `--engine claude`** pair / multi-engine routing is research-only at HEAD per the iter-0020 + iter-0033g + iter-0034 close-outs (see [`autoresearch/iterations/0020-pair-policy-narrow.md`](autoresearch/iterations/0020-pair-policy-narrow.md) + [`autoresearch/iterations/0034-phase-4-cutover.md`](autoresearch/iterations/0034-phase-4-cutover.md)). Pass `--engine auto` or `--engine codex` explicitly to opt into the research path; the harness silently downgrades to `claude` and emits a banner if the Codex CLI is missing.
27
+ Two skills cover the full cycle post iter-0034 Phase 4 cutover (2026-05-04). `/devlyn:ideate` is OPTIONAL; `/devlyn:resolve` is REQUIRED. **Both default to `--engine claude`** for PLAN/IMPLEMENT. Codex BUILD/IMPLEMENT and PLAN-pair remain research-only, but `/devlyn:resolve` VERIFY has a gated pair-JUDGE product path when its `SKILL.md` trigger policy fires. Pass `--engine auto` or `--engine codex` explicitly to opt into the broader research path; the harness silently downgrades to `claude` and emits a banner if the Codex CLI is missing.
28
28
 
29
29
  1. `/devlyn:ideate` (optional) — unstructured idea → `docs/specs/<id>/spec.md` + `spec.expected.json`. Modes: default Q&A, `--quick` (autonomous-pipeline-safe), `--from-spec <path>`, `--project`.
30
30
  2. `/devlyn:resolve` — hands-free pipeline for any coding task. Free-form goal, `--spec <path>`, or `--verify-only <diff> --spec <path>`. Phases: PLAN → IMPLEMENT → BUILD_GATE → CLEANUP → VERIFY (fresh subagent, findings-only).
@@ -46,8 +46,26 @@ benchmark/auto-resolve/
46
46
  │ ├── run-fixture.sh # one fixture × one arm, self-contained
47
47
  │ ├── judge.sh # Codex blind judge for one fixture
48
48
  │ ├── compile-report.py # aggregates into report.md + summary.json
49
- └── ship-gate.py # applies thresholds + writes history record
49
+ ├── ship-gate.py # applies thresholds + writes history record
50
+ │ ├── run-headroom-candidate.sh
51
+ │ ├── headroom-gate.py # blocks pair measurement without headroom set
52
+ │ ├── test-headroom-gate.sh
53
+ │ ├── run-full-pipeline-pair-candidate.sh
54
+ │ ├── full-pipeline-pair-gate.py
55
+ │ ├── test-full-pipeline-pair-gate.sh
56
+ │ ├── run-frozen-verify-pair.sh
57
+ │ ├── fetch-swebench-instances.py
58
+ │ ├── collect-swebench-predictions.py
59
+ │ ├── run-swebench-solver-batch.sh
60
+ │ ├── prepare-swebench-frozen-case.py
61
+ │ ├── prepare-swebench-frozen-corpus.py
62
+ │ ├── run-swebench-frozen-corpus.sh
63
+ │ ├── swebench-frozen-matrix.py
64
+ │ ├── test-swebench-frozen-case.sh
65
+ │ ├── frozen-verify-gate.py # gates frozen VERIFY pair-lift evidence
66
+ │ └── test-frozen-verify-gate.sh
50
67
 
68
+ ├── external/swebench/ # ignored local imports of SWE-bench cases/repos
51
69
  ├── results/<run-id>/ # per-run artifacts (overwritten)
52
70
  └── history/
53
71
  ├── runs/ # append-only, one JSON per run
@@ -71,6 +89,302 @@ Follow `fixtures/SCHEMA.md`. Six files per fixture: `metadata.json`, `spec.md`,
71
89
  4. Fill `expected.json` with concrete verification commands and forbidden patterns.
72
90
  5. Document purpose + failure mode in `NOTES.md`.
73
91
  6. Add `setup.sh` if the task needs the base `test-repo` modified before either arm starts.
92
+ 7. Run `bash scripts/lint-fixtures.sh`.
93
+
94
+ For L2/pair candidate fixtures, also run:
95
+
96
+ ```bash
97
+ bash benchmark/auto-resolve/scripts/run-headroom-candidate.sh F16-cli-quote-tax-rules
98
+ ```
99
+
100
+ This runs only the arms needed for calibration (`bare` and `solo_claude`),
101
+ blind-judges them, and applies `headroom-gate.py`. A candidate set is not
102
+ usable for pair measurement unless at least two fixtures pass and each fixture
103
+ has clean `bare <= 60` and `solo_claude <= 80` scores. A one-fixture calibration
104
+ run can show useful scores but does not satisfy the set gate.
105
+ When changing the gate itself, run:
106
+
107
+ ```bash
108
+ bash benchmark/auto-resolve/scripts/test-headroom-gate.sh
109
+ ```
110
+
111
+ After a full-pipeline pair run has the calibrated arms (`bare`,
112
+ `solo_claude`, `l2_gated`) plus a blind `judge.json`, gate it separately:
113
+
114
+ ```bash
115
+ bash benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
116
+ --max-pair-solo-wall-ratio 3 \
117
+ F21-cli-scheduler-priority F23-cli-fulfillment-wave
118
+ ```
119
+
120
+ The runner executes `bare` + `solo_claude`, applies `headroom-gate.py`, and
121
+ only then spends an `l2_gated` arm. To gate already-existing artifacts, run `full-pipeline-pair-gate.py` directly (second command below).
122
+
123
+ When a prompt-only pair change needs a fresh `l2_gated` measurement but the
124
+ calibrated `bare` + `solo_claude` arms are already clean, reuse them into a new
125
+ run id:
126
+
127
+ ```bash
128
+ bash benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
129
+ --run-id <new-run-id> \
130
+ --reuse-calibrated-from <prior-headroom-run-id> \
131
+ --max-pair-solo-wall-ratio 3 \
132
+ F21-cli-scheduler-priority F23-cli-fulfillment-wave
133
+ ```
134
+
135
+ ```bash
136
+ python3 benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
137
+ --run-id <full-pipeline-run-id> \
138
+ --min-fixtures 2 \
139
+ --min-pair-margin 5 \
140
+ --max-pair-solo-wall-ratio 3 \
141
+ --out-json benchmark/auto-resolve/results/<full-pipeline-run-id>/full-pipeline-pair-gate.json \
142
+ --out-md benchmark/auto-resolve/results/<full-pipeline-run-id>/full-pipeline-pair-gate.md
143
+ ```
144
+
145
+ This is the full-pipeline claim gate: each counted fixture must satisfy the
146
+ headroom precondition (`bare <= 60`, `solo_claude <= 80`), the `l2_gated` arm
147
+ must be clean, `pair_mode` must be true in the captured resolve state, and the
148
+ blind judge must score `l2_gated` at least `--min-pair-margin` above
149
+ `solo_claude`. When changing this gate, run:
150
+
151
+ ```bash
152
+ bash benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh
153
+ ```
154
+
155
+ Commands that reference `BENCH_FIXTURE_DIR` are hidden post-run oracles: they
156
+ are not staged into BUILD_GATE's `.devlyn/spec-verify.json`.
157
+
158
+ To compare pair VERIFY against solo VERIFY on a frozen implementation diff,
159
+ run:
160
+
161
+ ```bash
162
+ bash benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
163
+ --fixture F16-cli-quote-tax-rules \
164
+ --diff benchmark/auto-resolve/results/<run-id>/F16-cli-quote-tax-rules/solo_claude/diff.patch \
165
+ --pair-mode gated
166
+ ```
167
+
168
+ This applies the diff before `/devlyn:resolve` starts, then runs verify-only
169
+ solo and pair arms against the same committed work tree. `--pair-mode gated`
170
+ tests the product trigger policy; `--pair-mode forced` adds `--pair-verify` for
171
+ diagnostics. Use non-empty diffs only; empty diffs fail fast because they are
172
+ not valid pair evidence.
173
+ Hidden verifier context is available during VERIFY, so this runner prevents
174
+ IMPLEMENT contamination but is not an oracle-blind judge setup.
175
+ The runner writes `compare.json`; `pair_verdict_lift: true` means pair VERIFY
176
+ actually ran and found a verdict-binding issue that solo VERIFY did not.
177
+ If an imported case has no deterministic `verification_commands`, the runner
178
+ does not create `.devlyn/spec-verify.json`; an empty carrier counts as malformed under the
179
+ normal real-user contract and must not block qualitative frozen review.
180
+
181
+ To gate a set of frozen VERIFY results mechanically:
182
+
183
+ ```bash
184
+ python3 benchmark/auto-resolve/scripts/frozen-verify-gate.py \
185
+ --run-id 20260505T173913Z-9986cd3-frozen-verify \
186
+ --run-id 20260505T230215Z-9986cd3-frozen-verify \
187
+ --max-pair-solo-wall-ratio 3 \
188
+ --out-json benchmark/auto-resolve/results/frozen-verify-gate-20260505.json \
189
+ --out-md benchmark/auto-resolve/results/frozen-verify-gate-20260505.md
190
+ ```
191
+
192
+ When changing the gate itself, run its regression test:
193
+
194
+ ```bash
195
+ bash benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh
196
+ ```
197
+
198
+ This is deliberately narrower than `headroom-gate.py`: it does not claim
199
+ full-pipeline pair superiority. It proves only that, after the implementation
200
+ diff is frozen, gated pair VERIFY fires and returns a stricter verdict-binding
201
+ result than solo VERIFY on the same diff. Each supplied run must cover a
202
+ distinct fixture; repeated runs of the same fixture do not count as independent
203
+ corpus growth. `--max-pair-solo-wall-ratio` is optional, but use it for
204
+ ship-style evidence so quality lift is not accepted without a reasonable
205
+ wall-time bound. The gate infers the fixture id from the runner input metadata;
206
+ artifacts without that metadata, or with a fixture id absent from
207
+ the selected `--fixtures-root`, fail instead of being counted as anonymous or
208
+ fake evidence.
209
+
210
+ ### SWE-bench fixed-diff review pilot
211
+
212
+ SWE-bench is useful here as an external, widely known corpus, but the first
213
+ measurement surface should remain frozen VERIFY rather than full-pipeline
214
+ generation. The official dataset fields include `instance_id`, `repo`,
215
+ `base_commit`, `problem_statement`, `patch`, and `test_patch`; SWE-bench Lite is
216
+ the smaller subset and SWE-bench Verified is the human-validated subset.
217
+ See:
218
+
219
+ - https://www.swebench.com/SWE-bench/guides/datasets/
220
+ - https://www.swebench.com/lite.html
221
+ - https://www.swebench.com/verified.html
222
+
223
+ Fetch a small official Lite/Verified instance file without installing the
224
+ Hugging Face Python stack:
225
+
226
+ ```bash
227
+ python3 benchmark/auto-resolve/scripts/fetch-swebench-instances.py \
228
+ --dataset lite \
229
+ --limit 5 \
230
+ --out benchmark/auto-resolve/external/swebench/instances-lite.jsonl
231
+ ```
232
+
233
+ Prepare one case from an instance JSON and a candidate patch produced by a solo
234
+ run or another external solver:
235
+
236
+ ```bash
237
+ python3 benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py \
238
+ --instance-json /path/to/swebench-instance.json \
239
+ --model-patch /path/to/solo-candidate.patch
240
+ ```
241
+
242
+ Or prepare a small corpus from the official SWE-bench prediction JSONL shape
243
+ (`instance_id`, `model_name_or_path`, `model_patch`):
244
+
245
+ ```bash
246
+ python3 benchmark/auto-resolve/scripts/collect-swebench-predictions.py \
247
+ --patch-root /path/to/logs \
248
+ --instances-jsonl benchmark/auto-resolve/external/swebench/instances-lite.jsonl \
249
+ --model-name external-solo \
250
+ --out benchmark/auto-resolve/external/swebench/solo-predictions.jsonl
251
+ ```
252
+
253
+ The collector expects `/path/to/logs/<instance_id>/patch.diff`; it is useful
254
+ when another solver or a downloaded SWE-bench log bundle provides per-instance
255
+ patch files rather than prediction JSONL.
256
+
257
+ ```bash
258
+ python3 benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py \
259
+ --instances-jsonl benchmark/auto-resolve/external/swebench/instances-lite.jsonl \
260
+ --predictions-jsonl /path/to/solo-predictions.jsonl \
261
+ --limit 5 \
262
+ --out-manifest benchmark/auto-resolve/external/swebench/manifest.json
263
+ ```
264
+
265
+ Then run the command written to
266
+ `benchmark/auto-resolve/external/swebench/cases/<instance_id>/run-command.txt`.
267
+ For a one-off case, the command uses:
268
+
269
+ ```bash
270
+ bash benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
271
+ --fixture <instance_id> \
272
+ --fixtures-root benchmark/auto-resolve/external/swebench/cases \
273
+ --base-repo benchmark/auto-resolve/external/swebench/repos/<repo-cache> \
274
+ --diff benchmark/auto-resolve/external/swebench/cases/<instance_id>/model.patch \
275
+ --pair-mode gated
276
+ ```
277
+
278
+ For a prepared corpus manifest, run the whole set and gate it:
279
+
280
+ ```bash
281
+ bash benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh \
282
+ --manifest benchmark/auto-resolve/external/swebench/manifest.json \
283
+ --min-runs 2 \
284
+ --max-pair-solo-wall-ratio 3 \
285
+ --timeout-seconds 900 \
286
+ --resume-completed-arms \
287
+ --run-ids-out benchmark/auto-resolve/results/swebench-frozen-run-ids.txt \
288
+ --out-json benchmark/auto-resolve/results/swebench-frozen-gate.json \
289
+ --out-md benchmark/auto-resolve/results/swebench-frozen-gate.md
290
+ ```
291
+
292
+ To re-gate existing run ids without re-invoking providers, write one run id per
293
+ line and pass `--gate-only-run-ids <file>` with the same manifest. For large
294
+ tranches, keep `--run-ids-out` and use `--resume-completed-arms` on retries:
295
+ successful solo/pair arms are reused, while failed or provider-limited arms run
296
+ again. The run ids file is the durable handle for gate-only reruns and matrix
297
+ rendering after a bounded run finishes.
298
+
299
+ To produce local candidate patches for a bounded pilot, prepare a solver
300
+ worktree from the same instance JSONL. The generated spec contains only the
301
+ visible SWE-bench problem statement; do not read the instance's gold `patch` or
302
+ `test_patch` while solving.
303
+
304
+ ```bash
305
+ python3 benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py \
306
+ --instances-jsonl benchmark/auto-resolve/external/swebench/instances-lite.jsonl \
307
+ --instance-id django__django-11019 \
308
+ --copy-devlyn-context
309
+ ```
310
+
311
+ Run the prompt in `<worktree>/solve-prompt.txt`, save the resulting diff as
312
+ `<patch-root>/<instance_id>/patch.diff`, then use
313
+ `collect-swebench-predictions.py` to create prediction JSONL.
314
+
315
+ For a bounded local pilot, the batch runner performs those steps
316
+ sequentially and collects prediction JSONL. It redirects provider stdin away
317
+ from the manifest stream so later rows cannot be consumed by a child process.
318
+ The generated solver worktrees and repo caches can become large; once
319
+ `predictions-out` is written and cases are prepared, remove ignored local cache
320
+ directories such as `external/swebench/worktrees/` and
321
+ `external/swebench/repos-solver/` if disk pressure would otherwise interrupt
322
+ the frozen corpus run. Use `--timeout-seconds` and `--resume` for large
323
+ tranches; long-tail solver rows should be recorded as throughput failures
324
+ instead of letting one row hold the whole suite open.
325
+
326
+ ```bash
327
+ bash benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh \
328
+ --instances-jsonl benchmark/auto-resolve/external/swebench/instances-lite.jsonl \
329
+ --instance-id django__django-11039 \
330
+ --instance-id django__django-11049 \
331
+ --predictions-out benchmark/auto-resolve/external/swebench/predictions-lite.jsonl \
332
+ --copy-devlyn-context
333
+ ```
334
+
335
+ Gate a SWE-bench review pilot by pointing the existing frozen gate at the
336
+ external case root:
337
+
338
+ ```bash
339
+ python3 benchmark/auto-resolve/scripts/frozen-verify-gate.py \
340
+ --fixtures-root benchmark/auto-resolve/external/swebench/cases \
341
+ --run-id <swebench-frozen-run-1> \
342
+ --run-id <swebench-frozen-run-2> \
343
+ --run-id <swebench-frozen-run-3> \
344
+ --min-runs 3 \
345
+ --max-pair-solo-wall-ratio 3 \
346
+ --out-json benchmark/auto-resolve/results/swebench-frozen-gate.json \
347
+ --out-md benchmark/auto-resolve/results/swebench-frozen-gate.md
348
+ ```
349
+
350
+ This gives evidence for "pair review catches solo-missed verdict-binding issues
351
+ on real SWE-bench patches." The gate accepts either external solo-vs-pair
352
+ verdict lift or internal pair lift (`pair_judge` stricter than the pair run's
353
+ primary judge), because separate solo and pair primary judges are stochastic.
354
+ For evidence intended to support shipping policy, also set a wall-ratio cap and
355
+ inspect `avg_pair_solo_wall_ratio` plus each row's `pair_solo_wall_ratio`.
356
+ For selection-bias control, render every run in the attempted pilot, not just
357
+ gate rows. The matrix reports verdict-lift rows separately from recall-only
358
+ rows where pair found additional findings but did not change the binding
359
+ verdict. It also reports classification counts, gate rate, and trailing
360
+ non-gate rows. Use the optional yield thresholds when the matrix is meant to
361
+ fail closed instead of only documenting that additional rows are adding
362
+ controls without strengthening the proof gate:
363
+
364
+ ```bash
365
+ python3 benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
366
+ --title "SWE-bench Lite Frozen VERIFY Matrix" \
367
+ --verdict MIXED_WITH_GATE_PASS \
368
+ --gate-json benchmark/auto-resolve/results/swebench-frozen-gate.json \
369
+ --run-id <swebench-frozen-run-1> \
370
+ --run-id <swebench-frozen-run-2> \
371
+ --min-gate-rate 0.25 \
372
+ --max-trailing-non-gate 10 \
373
+ --out-json benchmark/auto-resolve/results/swebench-frozen-matrix.json \
374
+ --out-md benchmark/auto-resolve/results/swebench-frozen-matrix.md
375
+ ```
376
+
377
+ It does not measure official SWE-bench solve rate; run the official SWE-bench
378
+ evaluator separately for that metric. When changing the importer or
379
+ external-base runner path, run:
380
+
381
+ ```bash
382
+ bash benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh
383
+ ```
384
+
385
+ Do not use the retired full-pipeline `l2_forced` arm as pair evidence. It puts
386
+ `--pair-verify` in the initial prompt, so IMPLEMENT can become pair-aware before
387
+ the diff is frozen.
74
388
 
75
389
  ## LLM-upgrade resilience
76
390
 
@@ -91,7 +405,9 @@ Soft gates (warning, not block): suite-margin drop > 3, fixture losing its margi
91
405
 
92
406
  ## Running the full suite (real)
93
407
 
94
- Full real benchmark costs roughly 2-3 minutes per arm for simple fixtures and up to 15 minutes per arm for strict-route fixtures. A full n=1 run of 9 fixtures × 2 arms can take 30 min – 2 hrs depending on routes taken.
408
+ Full real benchmarks usually take 2-3 minutes per arm for simple fixtures and
409
+ up to 15 minutes per arm for strict-route fixtures. A full n=1 run of 9 fixtures
410
+ × 2 arms can take 30 min - 2 hrs depending on routes taken.
95
411
 
96
412
  ```bash
97
413
  # Smoke run before ship decisions
@@ -23,6 +23,12 @@ Does every Verification command behave as the spec states?
23
23
  - **7-12** — Major requirements missed.
24
24
  - **0-6** — Does not address the core task.
25
25
 
26
+ Mechanical cap: after the blind judge returns, `judge.sh` caps total score at
27
+ `floor(100 * verify_score)` and caps the Spec Compliance axis at
28
+ `floor(25 * verify_score)`. This makes the machine-readable acceptance
29
+ contract binding when a judge grades prose generously despite failed required
30
+ verification commands.
31
+
26
32
  ### Axis 2 — Constraint Respect (0-25)
27
33
 
28
34
  Zero new npm deps (unless spec allows), no silent catches (`try { } catch { return fallback }`), no `any`/`@ts-ignore`, explicit HOME/env guards where required, EACCES-specific handling, no hardcoded values that should be configurable.
@@ -0,0 +1,63 @@
1
+ # F10 — Notes
2
+
3
+ ## Purpose
4
+
5
+ Pair-discriminating high-risk fixture. Adds a write endpoint with persistent
6
+ storage. The pair-edge mechanism: when an implementation read-modify-
7
+ writes a JSON file in response to async requests, multiple in-flight POSTs
8
+ can interleave during the await gap between read and write — producing
9
+ duplicate ids or lost writes. A single-pass implementer typically writes
10
+ the natural shape:
11
+
12
+ ```js
13
+ const data = JSON.parse(await fs.readFile(...));
14
+ data.items.push({ id: data.items.length + 1, ...body });
15
+ await fs.writeFile(...);
16
+ ```
17
+
18
+ This passes single-POST tests trivially. The failure only surfaces when
19
+ multiple POSTs are in flight together. A reviewer with fresh eyes asking
20
+ "what happens if two of these run at once?" catches it; the same model
21
+ reviewing its own work tends to anchor on the "happy path" view.
22
+
23
+ ## Failure modes detected
24
+
25
+ - **Duplicate ids** from `length + 1` after concurrent reads.
26
+ - **Lost writes** from last-writer-wins overwrite of in-flight POSTs.
27
+ - **No persistence** — implementer keeps in-memory only, ignoring the
28
+ restart-survival requirement. Caught by `data/items.json` byte check.
29
+ - **Silent catch** wrapping the write path. Caught by forbidden_pattern.
30
+
31
+ ## Pipeline exercise
32
+
33
+ - Phase 1 BUILD: implementer must derive that an awaited file read-modify-
34
+ write under parallel requests requires either serialization (mutex /
35
+ queue) or a unique-id source independent of array length.
36
+ - Phase 2 EVAL: scrutinizes whether the new tests exercise the close-
37
+ together POST case rather than just single-POST happy path.
38
+ - Phase 3 CRITIC: scope discipline + production-ready invariant on the
39
+ consistency claim.
40
+
41
+ ## Discrimination expectation
42
+
43
+ Calibration target (set in pyx-memory project memory 2026-05-05):
44
+
45
+ - bare arm: 40-65 (passes spec wording verbatim, fails parallel verifier).
46
+ - solo arm: 60-78 (review pass may catch one mode, may miss another).
47
+ - pair arm: 75-90 (cross-perspective derivation of the consistency
48
+ invariant from the English "close together" wording).
49
+
50
+ ## Public-spec wording — load-bearing
51
+
52
+ The spec deliberately uses "close together" and "remain consistent"
53
+ instead of trigger keywords. If the spec were to say "race condition",
54
+ "concurrent", "atomic", or "lock", a single-pass solo arm would
55
+ keyword-match the answer pattern and ace the fixture. The English
56
+ prose forces invariant derivation, which is where pair has the edge.
57
+
58
+ ## Rotation trigger
59
+
60
+ Retire when both arms consistently land > 90 across two shipped versions,
61
+ OR when "close-together-write" becomes a recognized pattern such that
62
+ solo arm reliably reaches for a serializing mechanism on first read.
63
+ Whichever comes first.
@@ -0,0 +1,60 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/server.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": ["not ok "]
8
+ },
9
+ {
10
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/parallel.js\"",
11
+ "exit_code": 0,
12
+ "stdout_contains": ["\"ok\":true"],
13
+ "stdout_not_contains": [],
14
+ "contract_refs": [
15
+ "After multiple POSTs (single or many at once), `GET /items` reflects every successful POST exactly once",
16
+ "Several POSTs sent close together produce a list with every item, each with a distinct id"
17
+ ]
18
+ },
19
+ {
20
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/invalid.js\"",
21
+ "exit_code": 0,
22
+ "stdout_contains": ["\"ok\":true"],
23
+ "stdout_not_contains": [],
24
+ "contract_refs": [
25
+ "Invalid body (missing `name`, missing `qty`, `qty` not a positive number) → respond `400` with `{ error: 'invalid_body', field: '<name>' }`",
26
+ "An invalid POST returns `400` and the persisted file is byte-identical to its pre-request contents."
27
+ ]
28
+ },
29
+ {
30
+ "cmd": "node -e 'const fs=require(\"fs\"); const d=JSON.parse(fs.readFileSync(\"data/items.json\",\"utf8\")); console.log(Array.isArray(d.items));'",
31
+ "exit_code": 0,
32
+ "stdout_contains": ["true"],
33
+ "stdout_not_contains": ["false", "undefined"]
34
+ }
35
+ ],
36
+ "forbidden_patterns": [
37
+ {
38
+ "pattern": "catch\\s*(\\([^)]*\\))?\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
39
+ "description": "silent catch returning fallback in write path",
40
+ "files": ["server/index.js"],
41
+ "severity": "disqualifier"
42
+ },
43
+ {
44
+ "pattern": "catch\\s*(\\([^)]*\\))?\\s*\\{\\s*\\}",
45
+ "description": "empty catch block",
46
+ "files": ["server/index.js"],
47
+ "severity": "disqualifier"
48
+ },
49
+ {
50
+ "pattern": "/\\*\\s*eslint-disable",
51
+ "description": "eslint-disable without scoped justification",
52
+ "files": ["server/index.js"],
53
+ "severity": "disqualifier"
54
+ }
55
+ ],
56
+ "required_files": ["server/index.js", "tests/server.test.js", "data/items.json"],
57
+ "forbidden_files": [],
58
+ "max_deps_added": 0,
59
+ "spec_output_files": ["server/index.js", "tests/server.test.js"]
60
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F10-persist-write-collision",
3
+ "category": "high-risk",
4
+ "difficulty": "high",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add POST /items that appends a new entry and persists the full list to data/items.json. The list and its ids must remain consistent when multiple POSTs arrive close together — every saved item ends up with a unique numeric id and none are lost."
10
+ }
@@ -0,0 +1,17 @@
1
#!/usr/bin/env bash
# F10 setup — seed data/items.json with the same baseline the in-memory
# items started with, so ids 1 and 2 are already taken before the arm runs.
# -e: abort on any command failure; -u: treat unset vars as errors;
# -o pipefail: a failure anywhere in a pipeline fails the pipeline.
set -euo pipefail

mkdir -p data

cat > data/items.json <<'JSON'
{
  "items": [
    { "id": 1, "name": "alpha", "qty": 3 },
    { "id": 2, "name": "beta", "qty": 5 }
  ]
}
JSON

exit 0
@@ -0,0 +1,52 @@
1
+ ---
2
+ id: "F10-persist-write-collision"
3
+ title: "Add POST /items with persistent store"
4
+ status: planned
5
+ complexity: high
6
+ depends-on: []
7
+ ---
8
+
9
+ # F10 Add POST /items with persistent store
10
+
11
+ ## Context
12
+
13
+ `server/index.js` currently keeps `items` in an in-memory array that
14
+ resets on restart. The task: add `POST /items` that appends a new entry
15
+ and persists the full list to `data/items.json` so state survives
16
+ restarts. Read paths (`GET /items`, `GET /items/:id`) load from the same
17
+ persisted file.
18
+
19
+ The list and every saved item's `id` must remain consistent when
20
+ multiple POSTs arrive close together — no duplicate ids, no lost writes.
21
+
22
+ ## Requirements
23
+
24
+ - [ ] `POST /items` accepts JSON body `{ name: string, qty: number }` and returns `201` with `{ item: { id, name, qty } }` on success.
25
+ - [ ] `id` is a unique numeric value. Every successful POST returns an id no other item has, past or present.
26
+ - [ ] After multiple POSTs (single or many at once), `GET /items` reflects every successful POST exactly once — `items.length` equals starting count plus successful POST count.
27
+ - [ ] `data/items.json` on disk reflects the same state. Restarting the server preserves the items.
28
+ - [ ] `GET /items` and `GET /items/:id` read from the persisted file (or from state initialized from it at startup, with all writes persisted before the response is returned).
29
+ - [ ] Invalid body (missing `name`, missing `qty`, `qty` not a positive number) → respond `400` with `{ error: 'invalid_body', field: '<name>' }`. The persisted store must be unchanged after a 400 response.
30
+ - [ ] `tests/server.test.js` is updated. Existing assertions still hold AND at least two new tests cover POST behavior, including a test that fires several POST requests close together and verifies the resulting list contains every item with distinct ids.
31
+
32
+ ## Constraints
33
+
34
+ - **No new npm dependencies.** Use only Express + Node built-ins (`fs/promises`).
35
+ - **No silent catches.** Any `try/catch` in the write path must surface failure as `500` with a clear body, not return a fake success.
36
+ - **No hardcoded ids.** Existing baseline ids (1, 2) remain valid; new ids must not collide with any past or present id.
37
+ - **No breaking change** to `GET /items` shape or `GET /items/:id` semantics.
38
+ - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
39
+
40
+ ## Out of Scope
41
+
42
+ - Authentication, rate limiting, validation libraries.
43
+ - Database integration. The store is a single JSON file by design.
44
+ - Touching `bin/cli.js`, `web/`, or `tests/cli.test.js`.
45
+
46
+ ## Verification
47
+
48
+ - `node --test tests/server.test.js` exits 0 and the new tests cover POST.
49
+ - Several POSTs sent close together produce a list with every item, each with a distinct id — assertable from a small Node script that POSTs in parallel and reads back `GET /items`.
50
+ - After server restart, items posted before the restart are still present in `GET /items`.
51
+ - An invalid POST returns `400` and the persisted file is byte-identical to its pre-request contents.
52
+ - `git diff --stat` shows only `server/index.js` and `tests/server.test.js` touched (the `data/items.json` seed comes from setup, not the arm).
@@ -0,0 +1,9 @@
1
+ Add `POST /items` to `server/index.js` so users can append a new item with `{ name, qty }`. The full list should be persisted to `data/items.json` so state survives restarts — the existing read endpoints (`GET /items`, `GET /items/:id`) should also use the same persisted file going forward.
2
+
3
+ Every saved item must have a unique numeric id. If two or more POSTs arrive close together, the resulting list must contain every successful one with distinct ids — no duplicates, nothing lost.
4
+
5
+ Invalid body (missing `name`, missing `qty`, `qty` not a positive number) → respond `400` with `{ error: 'invalid_body', field: '<name>' }`, and the persisted store must be unchanged after the 400.
6
+
7
+ Update `tests/server.test.js` so existing tests still pass AND add at least two new tests covering POST behavior. One of them must fire several POSTs close together and verify the final list contains every item with distinct ids.
8
+
9
+ No new npm dependencies. Only touch `server/index.js`, `tests/server.test.js`, and `data/items.json` (which is seeded for you).
@@ -0,0 +1,29 @@
1
'use strict';
// F10 verifier: an invalid POST (body missing `qty`) must return 400 and
// must leave data/items.json byte-identical to its pre-request contents.
const fs = require('fs');
const http = require('http');
const path = require('path');
const { app } = require(path.join(process.env.BENCH_WORKDIR, 'server'));

// Snapshot the persisted store before the request so the byte-identity
// check below can prove the 400 path did not touch it.
const before = fs.readFileSync('data/items.json');

const s = http.createServer(app).listen(0, () => {
  const { port } = s.address();
  const req = http.request(
    { host: '127.0.0.1', port, method: 'POST', path: '/items',
      headers: { 'Content-Type': 'application/json' } },
    (r) => {
      let b = '';
      r.on('data', (c) => (b += c));
      r.on('end', () => {
        const after = fs.readFileSync('data/items.json');
        const same = before.equals(after);
        const ok = r.statusCode === 400 && same;
        console.log(JSON.stringify({ status: r.statusCode, store_unchanged: same, ok }));
        s.close();
        process.exit(ok ? 0 : 1);
      });
    }
  );
  // Fail fast instead of hanging the harness until the fixture timeout if
  // the request itself errors (e.g. the app crashes before responding).
  req.on('error', (err) => {
    console.log(JSON.stringify({ ok: false, error: err.message }));
    s.close();
    process.exit(1);
  });
  req.write(JSON.stringify({ name: 'noqty' }));
  req.end();
});
@@ -0,0 +1,50 @@
1
'use strict';
// F10 verifier: three POSTs fired in parallel must all succeed (201), and a
// read-back GET /items must show every item exactly once with distinct ids.
const http = require('http');
const path = require('path');
const { app } = require(path.join(process.env.BENCH_WORKDIR, 'server'));

const s = http.createServer(app).listen(0, () => {
  const { port } = s.address();

  // POST helper that never rejects: request errors resolve as status 0 so
  // Promise.all below always settles and the verifier can report a result.
  const post = (body) => new Promise((resolve) => {
    const req = http.request(
      { host: '127.0.0.1', port, method: 'POST', path: '/items',
        headers: { 'Content-Type': 'application/json' } },
      (r) => {
        let b = '';
        r.on('data', (c) => (b += c));
        r.on('end', () => resolve({ status: r.statusCode, body: b ? safeJson(b) : null }));
      }
    );
    req.on('error', () => resolve({ status: 0, body: null }));
    req.write(JSON.stringify(body));
    req.end();
  });

  Promise.all([
    post({ name: 'gamma', qty: 1 }),
    post({ name: 'delta', qty: 2 }),
    post({ name: 'epsilon', qty: 3 }),
  ]).then(async (results) => {
    // Give any trailing persistence work a moment to flush before reading back.
    await new Promise((r) => setTimeout(r, 250));
    const get = http.get(`http://127.0.0.1:${port}/items`, (r) => {
      let b = '';
      r.on('data', (c) => (b += c));
      r.on('end', () => {
        let data;
        try { data = JSON.parse(b); } catch { data = { items: [] }; }
        const items = Array.isArray(data.items) ? data.items : [];
        const ids = items.map((i) => i && i.id);
        const uniq = new Set(ids).size;
        const allPosted = results.every((x) => x.status === 201);
        // 2 seeded items + 3 parallel POSTs = 5 items, all ids distinct.
        const ok = allPosted && items.length === 5 && uniq === 5;
        console.log(JSON.stringify({
          len: items.length, uniq, status: results.map((r) => r.status), ok,
        }));
        s.close();
        process.exit(ok ? 0 : 1);
      });
    });
    // Fail fast instead of hanging the harness if the read-back GET errors.
    get.on('error', (err) => {
      console.log(JSON.stringify({ ok: false, error: err.message }));
      s.close();
      process.exit(1);
    });
  });
});

// Parse JSON, returning null on malformed input instead of throwing.
function safeJson(txt) { try { return JSON.parse(txt); } catch { return null; } }