devlyn-cli 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +321 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  49. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  57. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  65. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  74. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  83. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  84. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
  85. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
  86. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
  87. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
  88. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
  89. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
  90. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  91. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  92. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  93. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  94. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  95. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  96. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  97. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
  98. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
  99. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  100. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  101. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  102. package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
  103. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  104. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  105. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  106. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  107. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  108. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  109. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  110. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  111. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  112. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  113. package/config/skills/_shared/archive_run.py +3 -0
  114. package/config/skills/_shared/codex-config.md +2 -2
  115. package/config/skills/_shared/codex-monitored.sh +72 -7
  116. package/config/skills/_shared/collect-codex-findings.py +125 -0
  117. package/config/skills/_shared/engine-preflight.md +1 -1
  118. package/config/skills/_shared/expected.schema.json +18 -0
  119. package/config/skills/_shared/spec-verify-check.py +363 -10
  120. package/config/skills/_shared/verify-merge-findings.py +327 -0
  121. package/config/skills/devlyn:resolve/SKILL.md +69 -8
  122. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  123. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
  124. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  125. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  126. package/package.json +1 -1
  127. package/scripts/lint-skills.sh +69 -20
@@ -0,0 +1,34 @@
1
+ const assert = require('node:assert');
2
+ const fs = require('node:fs');
3
+ const os = require('node:os');
4
+ const path = require('node:path');
5
+ const { spawnSync } = require('node:child_process');
6
+
7
+ const workdir = process.env.BENCH_WORKDIR || process.cwd();
8
+ const input = path.join(os.tmpdir(), `cart-stock-${process.pid}.json`);
9
+
10
+ fs.writeFileSync(input, JSON.stringify({
11
+ state: 'OR',
12
+ coupon: null,
13
+ items: [
14
+ { sku: 'BAG', qty: 2 },
15
+ { sku: 'MUG', qty: 1 },
16
+ { sku: 'BAG', qty: 3 }
17
+ ]
18
+ }));
19
+
20
+ const proc = spawnSync('node', ['bin/cli.js', 'cart', '--input', input], {
21
+ cwd: workdir,
22
+ encoding: 'utf8'
23
+ });
24
+
25
+ assert.strictEqual(proc.status, 2);
26
+ assert.strictEqual(proc.stdout, '');
27
+ assert.deepStrictEqual(JSON.parse(proc.stderr), {
28
+ error: 'invalid_stock',
29
+ sku: 'BAG',
30
+ available: 4,
31
+ requested: 5
32
+ });
33
+
34
+ process.stdout.write(JSON.stringify({ ok: true }) + '\n');
@@ -0,0 +1,25 @@
1
+ # F26 CLI payout ledger rules
2
+
3
+ ## Failure mode
4
+
5
+ This fixture detects settlement implementations that pass simple payout tests
6
+ while mishandling idempotent events, conflicting duplicates, fee ordering,
7
+ dispute fees, reserves, minimum payout holds, and top-level totals.
8
+
9
+ ## Pipeline phase target
10
+
11
+ PLAN must preserve event deduplication and arithmetic order. IMPLEMENT must keep
12
+ fee/reserve math in integer cents and avoid hardcoded rules. VERIFY should build
13
+ adversarial ledger examples with repeated IDs, refunds, disputes, and reserves.
14
+
15
+ ## Why existing fixtures do not cover it
16
+
17
+ F16 covers quote math and F25 covers cart promotions, but neither has ledger
18
+ idempotency or conflicting duplicate events. F21/F23 became oracle-control
19
+ fixtures, so this adds a fresh visible-contract stateful arithmetic candidate.
20
+
21
+ ## Retirement
22
+
23
+ Retire or replace this fixture if solo consistently reaches ceiling or if
24
+ another fixture provides the same idempotent-ledger signal with cleaner
25
+ full-pipeline pair lift.
@@ -0,0 +1,68 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/cli.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": ["not ok "]
8
+ },
9
+ {
10
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/exact-payout.js\"",
11
+ "exit_code": 0,
12
+ "stdout_contains": ["\"ok\":true"],
13
+ "stdout_not_contains": [],
14
+ "contract_refs": [
15
+ "Identical duplicate event IDs are applied only once before merchant totals are computed.",
16
+ "A payout with charges, a refund, and a dispute computes processing fees, dispute fees, reserves, merchant payouts, and top-level totals exactly.",
17
+ "Processing fees apply to charges only; refunds do not reverse processing fees.",
18
+ "Dispute events subtract the dispute amount and add one dispute fee per dispute event.",
19
+ "Merchant rows preserve first-seen merchant order after idempotent duplicate removal."
20
+ ]
21
+ },
22
+ {
23
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/conflicting-duplicate.js\"",
24
+ "exit_code": 0,
25
+ "stdout_contains": ["\"ok\":true"],
26
+ "stdout_not_contains": [],
27
+ "contract_refs": [
28
+ "Conflicting duplicate events use exact error shape `{ \"error\": \"conflicting_duplicate\", \"id\": string }`.",
29
+ "A conflicting duplicate exits `2`, prints one JSON error to stderr, and prints no stdout.",
30
+ "The conflicting duplicate error object includes `error` and `id`."
31
+ ]
32
+ },
33
+ {
34
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/rules-source.js\"",
35
+ "exit_code": 0,
36
+ "stdout_contains": ["\"ok\":true"],
37
+ "stdout_not_contains": [],
38
+ "contract_refs": [
39
+ "Processing fee percent, fixed fee, dispute fee, reserve percent, and minimum payout threshold come from `data/payout-rules.json`. Do not hardcode these values in the command implementation.",
40
+ "Changing `data/payout-rules.json` fee or reserve settings changes command output without code changes."
41
+ ]
42
+ }
43
+ ],
44
+ "forbidden_patterns": [
45
+ {
46
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
47
+ "description": "silent catch returning fallback in payout path",
48
+ "files": ["bin/cli.js"],
49
+ "severity": "disqualifier"
50
+ },
51
+ {
52
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
53
+ "description": "empty catch block",
54
+ "files": ["bin/cli.js"],
55
+ "severity": "disqualifier"
56
+ },
57
+ {
58
+ "pattern": "/\\*\\s*eslint-disable",
59
+ "description": "eslint-disable without scoped justification",
60
+ "files": ["bin/cli.js"],
61
+ "severity": "disqualifier"
62
+ }
63
+ ],
64
+ "required_files": ["bin/cli.js", "tests/cli.test.js", "data/payout-rules.json"],
65
+ "forbidden_files": [],
66
+ "max_deps_added": 0,
67
+ "spec_output_files": ["bin/cli.js", "tests/cli.test.js"]
68
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F26-cli-payout-ledger-rules",
3
+ "category": "high-risk",
4
+ "difficulty": "high",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add a bench-cli payout command that reads ledger events from a JSON file, applies idempotent event handling and payout rules from data/payout-rules.json, and prints exact merchant payout totals with processing fees, dispute fees, reserves, and payouts in integer cents."
10
+ }
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env bash
2
+ # F26 setup — seed payout ledger rules.
3
+ set -e
4
+
5
+ mkdir -p data
6
+
7
+ cat > data/payout-rules.json <<'JSON'
8
+ {
9
+ "processing_fee_percent": 2.9,
10
+ "fixed_fee_cents": 30,
11
+ "dispute_fee_cents": 1500,
12
+ "reserve_percent": 10,
13
+ "minimum_payout_cents": 1000
14
+ }
15
+ JSON
16
+
17
+ exit 0
@@ -0,0 +1,68 @@
1
+ ---
2
+ id: "F26-cli-payout-ledger-rules"
3
+ title: "Payout command with ledger rules"
4
+ status: planned
5
+ complexity: high
6
+ depends-on: []
7
+ ---
8
+
9
+ # F26 Payout command with ledger rules
10
+
11
+ ## Context
12
+
13
+ `bench-cli` currently has greeting and version commands only. The task:
14
+ add a `payout` command that reads ledger events from a JSON file, applies
15
+ idempotent event handling and payout rules from `data/payout-rules.json`, and
16
+ prints exact merchant payout totals with processing fees, dispute fees,
17
+ reserves, and payouts in integer cents.
18
+
19
+ This is settlement math, so duplicate events must not corrupt totals and every
20
+ public amount must be integer cents.
21
+
22
+ ## Requirements
23
+
24
+ - [ ] `bench-cli payout --input <path>` reads JSON shaped as `{ "events": [{ "id": string, "merchant_id": string, "type": "charge" | "refund" | "dispute", "amount_cents": number }] }`.
25
+ - [ ] Processing fee percent, fixed fee, dispute fee, reserve percent, and minimum payout threshold come from `data/payout-rules.json`. Do not hardcode these values in the command implementation.
26
+ - [ ] Events with the same `id` and identical JSON content are idempotent duplicates and are applied only once.
27
+ - [ ] Events with the same `id` but different JSON content are conflicting duplicates. Validation happens before payout totals are printed; on a conflicting duplicate the command exits `2`, writes exactly one JSON error object to stderr, and writes no stdout.
28
+ - [ ] Conflicting duplicate events use exact error shape `{ "error": "conflicting_duplicate", "id": string }`.
29
+ - [ ] Unknown event type, missing `merchant_id`, missing `id`, non-positive or non-integer `amount_cents`, missing `events`, invalid JSON, or unreadable input exits `2` and writes exactly one JSON error object to stderr.
30
+ - [ ] Merchant rows are emitted in first-seen merchant order after idempotent duplicate removal.
31
+ - [ ] A `charge` increases `gross_charge_cents` and adds a processing fee of `Math.round(amount_cents * processing_fee_percent / 100) + fixed_fee_cents`.
32
+ - [ ] A `refund` increases `refund_cents`. Refunds do not reverse processing fees.
33
+ - [ ] A `dispute` increases `dispute_cents` and adds `dispute_fee_cents` from the rules for each dispute event.
34
+ - [ ] For each merchant, compute `net_before_reserve = gross_charge_cents - refund_cents - dispute_cents - processing_fee_cents - dispute_fee_cents`.
35
+ - [ ] `reserve_cents` is `Math.round(net_before_reserve * reserve_percent / 100)` when `net_before_reserve > 0`; otherwise `0`.
36
+ - [ ] `payout_cents = net_before_reserve - reserve_cents`.
37
+ - [ ] If `0 < payout_cents < minimum_payout_cents`, keep the merchant row but set `payout_cents` to `0` and add the original positive amount into `reserve_cents`.
38
+ - [ ] On success, write exactly one JSON object to stdout and no stderr. Keys: `total_payout_cents`, `total_processing_fee_cents`, `total_dispute_fee_cents`, `total_reserve_cents`, `merchants`.
39
+ - [ ] Each merchant row has keys `merchant_id`, `gross_charge_cents`, `refund_cents`, `dispute_cents`, `processing_fee_cents`, `dispute_fee_cents`, `reserve_cents`, `payout_cents`.
40
+ - [ ] `tests/cli.test.js` is updated. Existing tests still pass AND at least two new tests cover `payout`: one successful payout and one validation failure.
41
+
42
+ ## Constraints
43
+
44
+ - **No new npm dependencies.**
45
+ - **No floating-money output.** All public amounts are integer cents.
46
+ - **No silent catches.** If parsing or file reading fails, emit a visible JSON error to stderr and exit `2`.
47
+ - **No extra stdout/stderr text** on the success path; downstream tooling parses stdout as JSON.
48
+
49
+ ## Out of Scope
50
+
51
+ - Persisting payouts or mutating a ledger.
52
+ - Currency conversion.
53
+ - Time zones, reporting periods, or settlement dates.
54
+ - Adding web UI or server routes.
55
+ - Touching `server/`, `web/`, or `tests/server.test.js`.
56
+
57
+ ## Verification
58
+
59
+ - `node --test tests/cli.test.js` exits 0.
60
+ - Identical duplicate event IDs are applied only once before merchant totals are computed.
61
+ - A payout with charges, a refund, and a dispute computes processing fees, dispute fees, reserves, merchant payouts, and top-level totals exactly.
62
+ - Processing fees apply to charges only; refunds do not reverse processing fees.
63
+ - Dispute events subtract the dispute amount and add one dispute fee per dispute event.
64
+ - Merchant rows preserve first-seen merchant order after idempotent duplicate removal.
65
+ - A conflicting duplicate exits `2`, prints one JSON error to stderr, and prints no stdout.
66
+ - The conflicting duplicate error object includes `error` and `id`.
67
+ - Changing `data/payout-rules.json` fee or reserve settings changes command output without code changes.
68
+ - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched (the payout rules seed comes from setup, not the arm).
@@ -0,0 +1,7 @@
1
+ Add a bench-cli payout command that reads ledger events from a JSON file, applies idempotent event handling and payout rules from data/payout-rules.json, and prints exact merchant payout totals with processing fees, dispute fees, reserves, and payouts in integer cents.
2
+
3
+ The command should be `bench-cli payout --input <path>`. Input JSON has an events array. Events have id, merchant_id, type, and amount_cents. Use the payout rules JSON for processing fee percent, fixed fee, dispute fee, reserve percent, and minimum payout threshold. Do not hardcode those values.
4
+
5
+ Identical duplicate event IDs are idempotent and should be applied only once. The same event ID with different content is a conflicting duplicate and must fail before printing totals. Successful output must be one JSON object with top-level totals and merchant rows in first-seen merchant order. Validation errors must exit 2, write one JSON error object to stderr, and write no stdout.
6
+
7
+ Update `tests/cli.test.js` so existing tests still pass and at least two new tests cover the payout command, including one successful payout and one validation failure. Do not add dependencies or touch the server/web files.
@@ -0,0 +1,29 @@
1
+ const assert = require('node:assert');
2
+ const fs = require('node:fs');
3
+ const os = require('node:os');
4
+ const path = require('node:path');
5
+ const { spawnSync } = require('node:child_process');
6
+
7
+ const workdir = process.env.BENCH_WORKDIR || process.cwd();
8
+ const input = path.join(os.tmpdir(), `payout-conflict-${process.pid}.json`);
9
+
10
+ fs.writeFileSync(input, JSON.stringify({
11
+ events: [
12
+ { id: 'evt-conflict', merchant_id: 'm_1', type: 'charge', amount_cents: 1000 },
13
+ { id: 'evt-conflict', merchant_id: 'm_1', type: 'charge', amount_cents: 1001 }
14
+ ]
15
+ }));
16
+
17
+ const proc = spawnSync('node', ['bin/cli.js', 'payout', '--input', input], {
18
+ cwd: workdir,
19
+ encoding: 'utf8'
20
+ });
21
+
22
+ assert.strictEqual(proc.status, 2);
23
+ assert.strictEqual(proc.stdout, '');
24
+ assert.deepStrictEqual(JSON.parse(proc.stderr), {
25
+ error: 'conflicting_duplicate',
26
+ id: 'evt-conflict'
27
+ });
28
+
29
+ process.stdout.write(JSON.stringify({ ok: true }) + '\n');
@@ -0,0 +1,58 @@
1
+ const assert = require('node:assert');
2
+ const fs = require('node:fs');
3
+ const os = require('node:os');
4
+ const path = require('node:path');
5
+ const { spawnSync } = require('node:child_process');
6
+
7
+ const workdir = process.env.BENCH_WORKDIR || process.cwd();
8
+ const input = path.join(os.tmpdir(), `payout-success-${process.pid}.json`);
9
+
10
+ const charge1 = { id: 'evt-1', merchant_id: 'm_1', type: 'charge', amount_cents: 10000 };
11
+ fs.writeFileSync(input, JSON.stringify({
12
+ events: [
13
+ charge1,
14
+ { id: 'evt-2', merchant_id: 'm_2', type: 'charge', amount_cents: 5000 },
15
+ charge1,
16
+ { id: 'evt-3', merchant_id: 'm_1', type: 'refund', amount_cents: 2500 },
17
+ { id: 'evt-4', merchant_id: 'm_1', type: 'charge', amount_cents: 3333 },
18
+ { id: 'evt-5', merchant_id: 'm_2', type: 'dispute', amount_cents: 2000 }
19
+ ]
20
+ }));
21
+
22
+ const proc = spawnSync('node', ['bin/cli.js', 'payout', '--input', input], {
23
+ cwd: workdir,
24
+ encoding: 'utf8'
25
+ });
26
+
27
+ assert.strictEqual(proc.status, 0, proc.stderr || proc.stdout);
28
+ assert.strictEqual(proc.stderr, '');
29
+ assert.deepStrictEqual(JSON.parse(proc.stdout), {
30
+ total_payout_cents: 10539,
31
+ total_processing_fee_cents: 622,
32
+ total_dispute_fee_cents: 1500,
33
+ total_reserve_cents: 1172,
34
+ merchants: [
35
+ {
36
+ merchant_id: 'm_1',
37
+ gross_charge_cents: 13333,
38
+ refund_cents: 2500,
39
+ dispute_cents: 0,
40
+ processing_fee_cents: 447,
41
+ dispute_fee_cents: 0,
42
+ reserve_cents: 1039,
43
+ payout_cents: 9347
44
+ },
45
+ {
46
+ merchant_id: 'm_2',
47
+ gross_charge_cents: 5000,
48
+ refund_cents: 0,
49
+ dispute_cents: 2000,
50
+ processing_fee_cents: 175,
51
+ dispute_fee_cents: 1500,
52
+ reserve_cents: 133,
53
+ payout_cents: 1192
54
+ }
55
+ ]
56
+ });
57
+
58
+ process.stdout.write(JSON.stringify({ ok: true }) + '\n');
@@ -0,0 +1,56 @@
1
+ const assert = require('node:assert');
2
+ const fs = require('node:fs');
3
+ const os = require('node:os');
4
+ const path = require('node:path');
5
+ const { spawnSync } = require('node:child_process');
6
+
7
+ const workdir = process.env.BENCH_WORKDIR || process.cwd();
8
+ const rulesPath = path.join(workdir, 'data', 'payout-rules.json');
9
+ const original = fs.readFileSync(rulesPath, 'utf8');
10
+
11
+ try {
12
+ fs.writeFileSync(rulesPath, JSON.stringify({
13
+ processing_fee_percent: 1,
14
+ fixed_fee_cents: 10,
15
+ dispute_fee_cents: 77,
16
+ reserve_percent: 0,
17
+ minimum_payout_cents: 1
18
+ }, null, 2) + '\n');
19
+
20
+ const input = path.join(os.tmpdir(), `payout-rules-${process.pid}.json`);
21
+ fs.writeFileSync(input, JSON.stringify({
22
+ events: [
23
+ { id: 'evt-1', merchant_id: 'm_1', type: 'charge', amount_cents: 10000 }
24
+ ]
25
+ }));
26
+
27
+ const proc = spawnSync('node', ['bin/cli.js', 'payout', '--input', input], {
28
+ cwd: workdir,
29
+ encoding: 'utf8'
30
+ });
31
+
32
+ assert.strictEqual(proc.status, 0, proc.stderr || proc.stdout);
33
+ assert.strictEqual(proc.stderr, '');
34
+ assert.deepStrictEqual(JSON.parse(proc.stdout), {
35
+ total_payout_cents: 9890,
36
+ total_processing_fee_cents: 110,
37
+ total_dispute_fee_cents: 0,
38
+ total_reserve_cents: 0,
39
+ merchants: [
40
+ {
41
+ merchant_id: 'm_1',
42
+ gross_charge_cents: 10000,
43
+ refund_cents: 0,
44
+ dispute_cents: 0,
45
+ processing_fee_cents: 110,
46
+ dispute_fee_cents: 0,
47
+ reserve_cents: 0,
48
+ payout_cents: 9890
49
+ }
50
+ ]
51
+ });
52
+ } finally {
53
+ fs.writeFileSync(rulesPath, original);
54
+ }
55
+
56
+ process.stdout.write(JSON.stringify({ ok: true }) + '\n');
@@ -37,7 +37,6 @@ so existing assertions continue to pass alongside new paging assertions.
37
37
  - **No breaking change to `/items/:id`.** The per-item route must keep its current contract (the fixture explicitly does NOT paginate single-item lookups).
38
38
  - **Backward-compat note**: clients that previously read `response.items` MUST still get the array at the same key inside the new envelope.
39
39
 
40
- - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
41
40
 
42
41
  ## Out of Scope
43
42
 
@@ -31,7 +31,6 @@ and italicized — using only the page's own CSS/JS.
31
31
  - **No inline JS frameworks.** Stick to the vanilla pattern already in `index.html`.
32
32
  - **Accessibility.** Both buttons must have accessible names equal to their visible labels; `#whisper` adds `aria-label="whisper"` only if its visible text differs (it doesn't, so leave it off).
33
33
 
34
- - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
35
34
 
36
35
  ## Out of Scope
37
36
 
@@ -31,7 +31,6 @@ Implement it so every test passes.
31
31
  - **Do not modify `tests/count.test.js`.** If a test looks wrong, that's a signal to revisit the implementation, not the test.
32
32
  - **No silent catches.** Errors reading stdin must surface with a clear message (not suppressed).
33
33
 
34
- - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
35
34
 
36
35
  ## Out of Scope
37
36
 
@@ -30,7 +30,6 @@ already provides everything needed; no external dependency is warranted.
30
30
  - **Stream-friendly.** Large files should not be read fully into memory. Use a hash stream (`crypto.createHash('sha256')` + pipe from `fs.createReadStream`).
31
31
  - **No silent catches.** File I/O errors must surface with an informative message and the appropriate exit code.
32
32
 
33
- - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
34
33
 
35
34
  ## Out of Scope
36
35
 
@@ -27,7 +27,6 @@ version without string manipulation. Add a `--format json` flag that makes
27
27
  - **Touch only `bin/cli.js` (`version` handler + argument parsing) and `tests/cli.test.js` (new test).** Do NOT modify the `hello` subcommand or any other file.
28
28
  - **No silent catches.** Unknown `--format` values must surface an error.
29
29
 
30
- - **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
31
30
 
32
31
  ## Out of Scope
33
32
 
@@ -42,9 +42,6 @@ inside `/devlyn:resolve` (no separate preflight skill in the 2-skill design).
42
42
  - **No silent catches.**
43
43
  - **Non-git-repo handling.** Do not assume the user is always in a repo.
44
44
 
45
- - **Lifecycle note.** The harness's CLEANUP/VERIFY phases may flip this
46
- spec's frontmatter `status` after implementation completes — that is
47
- benchmark lifecycle bookkeeping, not a scope violation.
48
45
 
49
46
  ## Out of Scope
50
47
 
@@ -73,7 +73,8 @@ Machine-readable acceptance criteria used by both `run-fixture.sh` verification
73
73
  "cmd": "node bin/cli.js doctor",
74
74
  "exit_code": 0,
75
75
  "stdout_contains": ["doctor: "],
76
- "stdout_not_contains": ["undefined"]
76
+ "stdout_not_contains": ["undefined"],
77
+ "contract_refs": []
77
78
  }
78
79
  ],
79
80
  "forbidden_patterns": [
@@ -86,14 +87,25 @@ Machine-readable acceptance criteria used by both `run-fixture.sh` verification
86
87
  ],
87
88
  "required_files": ["bin/cli.js"],
88
89
  "forbidden_files": [],
90
+ "tier_a_waivers": [],
91
+ "spec_output_files": ["bin/cli.js"],
89
92
  "max_deps_added": 0
90
93
  }
91
94
  ```
92
95
 
93
96
  - **verification_commands** — runner executes each. Each command's pass/fail contributes to the arm's `verify_score`.
97
+ Commands run with `BENCH_WORKDIR` (fresh arm work tree) and
98
+ `BENCH_FIXTURE_DIR` (the fixture directory outside the arm work tree) in
99
+ the environment. Put discriminator/oracle scripts under the fixture
100
+ directory when the arm should not read the verifier source.
101
+ Any command that references `BENCH_FIXTURE_DIR` is a hidden oracle and must
102
+ include `contract_refs`: exact substrings from `spec.md` proving the oracle
103
+ tests a visible contract rather than inventing a narrower one.
94
104
  - **forbidden_patterns** — regexes scanned across `diff.patch`. Match at `severity: "disqualifier"` is a hard-floor fail. Match at `severity: "warning"` goes into the judge's critical-findings report.
95
105
  - **required_files** — must exist after the arm runs.
96
106
  - **forbidden_files** — must NOT appear in the arm's diff.
107
+ - **tier_a_waivers** — optional globs for files the spec explicitly authorizes even though the Tier A scope oracle would normally flag them.
108
+ - **spec_output_files** — files or globs that define the authorized output surface for Tier B scope tracing.
97
109
  - **max_deps_added** — count of new entries under `dependencies`/`devDependencies` in `package.json`. Exceeds → hard-floor fail.
98
110
 
99
111
  ## NOTES.md
@@ -0,0 +1,98 @@
1
+ #!/usr/bin/env python3
2
+ """Collect patch.diff files into SWE-bench prediction JSONL."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+
12
def read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load a JSON Lines file into a list of JSON objects.

    Lines that are empty or whitespace-only are skipped. Every other line
    must decode to a JSON object (a ``dict``).

    Args:
        path: File to read; opened as UTF-8 text.

    Returns:
        The decoded objects, in file order.

    Raises:
        json.JSONDecodeError: A line is not valid JSON. The message is
            prefixed with ``path:line_no`` so the offending line can be
            located in the file.
        ValueError: A line decodes to something other than a JSON object.
    """
    rows: list[dict[str, Any]] = []
    with path.open(encoding="utf8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                value = json.loads(line)
            except json.JSONDecodeError as err:
                # Keep the exception type, but say which file and line failed;
                # the decoder only knows the position inside this one line.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {err.msg}", err.doc, err.pos
                ) from err
            if not isinstance(value, dict):
                raise ValueError(f"{path}:{line_no}: expected JSON object")
            rows.append(value)
    return rows
23
+
24
+
25
def instance_ids_from_jsonl(path: Path | None) -> set[str] | None:
    """Gather the ``instance_id`` of every row in a JSONL file.

    Args:
        path: JSONL file to read, or ``None`` when no file was supplied.

    Returns:
        ``None`` when ``path`` is ``None``; otherwise the set of
        ``instance_id`` values found in the file.

    Raises:
        ValueError: A row's ``instance_id`` is absent, not a string, or empty.
    """
    if path is None:
        return None

    collected: set[str] = set()
    for record in read_jsonl(path):
        candidate = record.get("instance_id")
        if isinstance(candidate, str) and candidate:
            collected.add(candidate)
            continue
        raise ValueError(f"{path}: row missing non-empty instance_id")
    return collected
35
+
36
+
37
def collect_from_root(root: Path, patch_name: str, keep: set[str] | None) -> list[tuple[str, Path]]:
    """Find ``<root>/<dir>/<patch_name>`` files one directory level below ``root``.

    The name of the directory containing each match is used as its instance id.

    Args:
        root: Directory whose immediate subdirectories are searched.
        patch_name: File name to look for inside each subdirectory.
        keep: When not ``None``, only instance ids in this set are returned.

    Returns:
        ``(instance_id, patch_path)`` pairs, ordered by sorted patch path.
    """
    matches = sorted(root.glob(f"*/{patch_name}"))
    return [
        (found.parent.name, found)
        for found in matches
        if keep is None or found.parent.name in keep
    ]
45
+
46
+
47
def main() -> int:
    """Command-line entry point: bundle patch files into a predictions JSONL.

    Parses the command-line flags, finds ``<patch-root>/<instance>/<patch-name>``
    files (optionally restricted to the ids listed in ``--instances-jsonl``),
    and writes one JSON object per non-empty patch to ``--out`` with the keys
    ``instance_id``, ``model_name_or_path`` and ``model_patch``. A JSON
    summary report is printed to stdout.

    Returns:
        ``0`` on success.

    Raises:
        ValueError: No patch files were found, or a patch is empty
            (whitespace-only) and ``--allow-empty`` was not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--patch-root", required=True, type=Path)
    parser.add_argument("--patch-name", default="patch.diff")
    parser.add_argument("--instances-jsonl", type=Path, help="Optional filter/validation set.")
    parser.add_argument("--model-name", required=True)
    parser.add_argument("--out", required=True, type=Path)
    parser.add_argument("--allow-empty", action="store_true")
    args = parser.parse_args()

    keep = instance_ids_from_jsonl(args.instances_jsonl)
    patches = collect_from_root(args.patch_root, args.patch_name, keep)
    if not patches:
        raise ValueError(f"no {args.patch_name} files found under {args.patch_root}")

    # Read and validate every patch BEFORE opening the output file. Opening
    # with "w" truncates it, so failing on an empty patch mid-loop would
    # otherwise leave a partial (or clobbered) predictions file behind.
    prediction_lines: list[str] = []
    skipped_empty: list[str] = []
    for instance_id, patch_path in patches:
        patch = patch_path.read_text(encoding="utf8")
        if not patch.strip():
            if args.allow_empty:
                skipped_empty.append(instance_id)
                continue
            raise ValueError(f"empty patch for {instance_id}: {patch_path}")
        prediction_lines.append(
            json.dumps(
                {
                    "instance_id": instance_id,
                    "model_name_or_path": args.model_name,
                    "model_patch": patch,
                }
            )
            + "\n"
        )

    args.out.parent.mkdir(parents=True, exist_ok=True)
    with args.out.open("w", encoding="utf8") as f:
        f.writelines(prediction_lines)

    report = {
        "patch_root": str(args.patch_root),
        "patch_name": args.patch_name,
        "model_name_or_path": args.model_name,
        "out": str(args.out),
        "predictions_written": len(prediction_lines),
        "empty_skipped": skipped_empty,
    }
    print(json.dumps(report, indent=2))
    return 0
95
+
96
+
97
# Run the collector only when this file is executed as a script; the value
# returned by main() is handed to SystemExit and becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())