devlyn-cli 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/README.md +1 -1
- package/benchmark/auto-resolve/README.md +318 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/bin/devlyn.js +56 -10
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +312 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:ideate/SKILL.md +1 -1
- package/config/skills/devlyn:resolve/SKILL.md +62 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +32 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// Verifier: redeeming more than a gift card holds must fail with the exact
// `insufficient_balance` error contract.
//
// Writes a request whose two redemptions against card GC-100 add up to
// 5500 cents, runs `bin/cli.js gift-card --input <file>` from the benchmark
// workdir, and asserts exit status 2, empty stdout, and the exact JSON error
// on stderr. Prints {"ok":true} on stdout once every check has passed; a
// failed assertion throws, so the script exits non-zero.
const assert = require('node:assert');
const fs = require('node:fs');
const os = require('node:os');
const path = require('node:path');
const { spawnSync } = require('node:child_process');

// BENCH_WORKDIR points at the checkout under test; fall back to the cwd.
const workdir = process.env.BENCH_WORKDIR || process.cwd();
// The pid suffix keeps concurrently running verifiers from sharing a file.
const input = path.join(os.tmpdir(), `gift-card-balance-${process.pid}.json`);

try {
  fs.writeFileSync(input, JSON.stringify({
    order_id: 'order-balance',
    lines: [
      { sku: 'TEE', qty: 3 },
      { sku: 'BAG', qty: 1 }
    ],
    // Two redemptions on the same card: the expected error below reports
    // their sum (3000 + 2500) as `requested_cents`.
    redeems: [
      { card_id: 'GC-100', amount_cents: 3000 },
      { card_id: 'GC-100', amount_cents: 2500 }
    ]
  }));

  const proc = spawnSync('node', ['bin/cli.js', 'gift-card', '--input', input], {
    cwd: workdir,
    encoding: 'utf8'
  });

  // Surface a failure to launch the CLI instead of a confusing `null !== 2`.
  assert.ifError(proc.error);
  // Include the CLI output in the failure message, as the success-path
  // verifiers do, so an unexpected exit code is diagnosable.
  assert.strictEqual(proc.status, 2, proc.stderr || proc.stdout);
  assert.strictEqual(proc.stdout, '');
  // deepStrictEqual rejects extra or missing keys: the error shape is exact.
  assert.deepStrictEqual(JSON.parse(proc.stderr), {
    error: 'insufficient_balance',
    card_id: 'GC-100',
    available_cents: 5000,
    requested_cents: 5500
  });
} finally {
  // Do not leave the request file behind in the shared temp directory.
  fs.rmSync(input, { force: true });
}

process.stdout.write(JSON.stringify({ ok: true }) + '\n');
|
package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
// Verifier: the gift-card command must read prices and card balances from
// `data/gift-cards.json` rather than hardcoding them.
//
// Temporarily replaces the rules file with values that appear nowhere else
// (TEE at 1000 cents, card GC-SOURCE with a 900-cent balance), runs
// `bin/cli.js gift-card --input <file>`, and asserts the exact success
// output derived from those replacement values. The original rules file is
// always restored. Prints {"ok":true} on stdout once every check has passed.
const assert = require('node:assert');
const fs = require('node:fs');
const os = require('node:os');
const path = require('node:path');
const { spawnSync } = require('node:child_process');

// BENCH_WORKDIR points at the checkout under test; fall back to the cwd.
const workdir = process.env.BENCH_WORKDIR || process.cwd();
const rulesPath = path.join(workdir, 'data', 'gift-cards.json');
// The pid suffix keeps concurrently running verifiers from sharing a file.
const input = path.join(os.tmpdir(), `gift-card-source-${process.pid}.json`);
// Snapshot taken before anything is modified so `finally` can restore it.
const original = fs.readFileSync(rulesPath, 'utf8');

try {
  fs.writeFileSync(rulesPath, JSON.stringify({
    products: {
      TEE: { unit_cents: 1000 }
    },
    cards: {
      'GC-SOURCE': { balance_cents: 900, active: true }
    }
  }, null, 2) + '\n');

  fs.writeFileSync(input, JSON.stringify({
    order_id: 'order-source',
    lines: [
      { sku: 'TEE', qty: 1 }
    ],
    redeems: [
      { card_id: 'GC-SOURCE', amount_cents: 700 }
    ]
  }));

  const proc = spawnSync('node', ['bin/cli.js', 'gift-card', '--input', input], {
    cwd: workdir,
    encoding: 'utf8'
  });

  // Surface a failure to launch the CLI instead of a confusing `null !== 0`.
  assert.ifError(proc.error);
  assert.strictEqual(proc.status, 0, proc.stderr || proc.stdout);
  assert.strictEqual(proc.stderr, '');
  // 1000 subtotal - 700 redeemed = 300 due; 900 balance - 700 = 200 left.
  assert.deepStrictEqual(JSON.parse(proc.stdout), {
    order_id: 'order-source',
    subtotal_cents: 1000,
    gift_card_applied_cents: 700,
    amount_due_cents: 300,
    items: [
      { sku: 'TEE', qty: 1, line_cents: 1000 }
    ],
    redemptions: [
      { card_id: 'GC-SOURCE', applied_cents: 700, remaining_balance_cents: 200 }
    ]
  });
} finally {
  try {
    // Restoring the rules file comes first: later verifiers depend on it.
    fs.writeFileSync(rulesPath, original);
  } finally {
    // Do not leave the request file behind in the shared temp directory.
    fs.rmSync(input, { force: true });
  }
}

process.stdout.write(JSON.stringify({ ok: true }) + '\n');
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# F28 CLI rental quote rules
|
|
2
|
+
|
|
3
|
+
## Why this fixture exists
|
|
4
|
+
|
|
5
|
+
F27 was rejected because the direct bare arm passed every verifier. F28 returns
|
|
6
|
+
to the F16 pattern that produced valid lift: exact success shape plus exact
|
|
7
|
+
validation shape, with enough arithmetic and date handling that a direct
|
|
8
|
+
implementation is likely to leak extra fields or miss one contract.
|
|
9
|
+
|
|
10
|
+
## Pair expectation
|
|
11
|
+
|
|
12
|
+
PLAN must preserve the date-counting and duplicate-combine invariants.
|
|
13
|
+
IMPLEMENT must keep all public amounts in integer cents and read rules from
|
|
14
|
+
`data/rental-rules.json`. VERIFY should probe both the Friday-to-Tuesday
|
|
15
|
+
weekend count and the combined-stock exact error shape.
|
|
16
|
+
|
|
17
|
+
## Isolation
|
|
18
|
+
|
|
19
|
+
F16 covers checkout tax rules. F28 covers rental-day UTC math, weekend
|
|
20
|
+
surcharges, deposits, protection fees, and non-persistent inventory validation.
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
{
|
|
2
|
+
"verification_commands": [
|
|
3
|
+
{
|
|
4
|
+
"cmd": "node --test tests/cli.test.js",
|
|
5
|
+
"exit_code": 0,
|
|
6
|
+
"stdout_contains": [],
|
|
7
|
+
"stdout_not_contains": ["not ok "]
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/exact-success.js\"",
|
|
11
|
+
"exit_code": 0,
|
|
12
|
+
"stdout_contains": ["\"ok\":true"],
|
|
13
|
+
"stdout_not_contains": [],
|
|
14
|
+
"contract_refs": [
|
|
15
|
+
"A Friday-to-Tuesday rental counts four rental days and two weekend days.",
|
|
16
|
+
"Duplicate SKUs are combined before stock validation and pricing.",
|
|
17
|
+
"A successful quote emits exact integer-cent weekend surcharge, discount, protection, deposit, and total fields."
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/unavailable-inventory.js\"",
|
|
22
|
+
"exit_code": 0,
|
|
23
|
+
"stdout_contains": ["\"ok\":true"],
|
|
24
|
+
"stdout_not_contains": [],
|
|
25
|
+
"contract_refs": [
|
|
26
|
+
"A combined quantity over stock exits `2`, prints one JSON error to stderr, and prints no stdout.",
|
|
27
|
+
"Combined quantity over stock uses exact error shape `{ \"error\": \"unavailable_inventory\", \"sku\": string, \"available\": number, \"requested\": number }`.",
|
|
28
|
+
"The unavailable inventory error object includes `error`, `sku`, `available`, and `requested`."
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/rules-source.js\"",
|
|
33
|
+
"exit_code": 0,
|
|
34
|
+
"stdout_contains": ["\"ok\":true"],
|
|
35
|
+
"stdout_not_contains": [],
|
|
36
|
+
"contract_refs": [
|
|
37
|
+
"Item daily rates, stock, deposits, weekend surcharge percent, protection daily price, and coupons come from `data/rental-rules.json`. Do not hardcode those values in the command implementation.",
|
|
38
|
+
"Changing `data/rental-rules.json` rates, deposits, or surcharge settings changes command output without code changes."
|
|
39
|
+
]
|
|
40
|
+
}
|
|
41
|
+
],
|
|
42
|
+
"forbidden_patterns": [
|
|
43
|
+
{
|
|
44
|
+
"pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
|
|
45
|
+
"description": "silent catch returning fallback in rental quote path",
|
|
46
|
+
"files": ["bin/cli.js"],
|
|
47
|
+
"severity": "disqualifier"
|
|
48
|
+
},
|
|
49
|
+
{
|
|
50
|
+
"pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
|
|
51
|
+
"description": "empty catch block",
|
|
52
|
+
"files": ["bin/cli.js"],
|
|
53
|
+
"severity": "disqualifier"
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"pattern": "/\\*\\s*eslint-disable",
|
|
57
|
+
"description": "eslint-disable without scoped justification",
|
|
58
|
+
"files": ["bin/cli.js"],
|
|
59
|
+
"severity": "disqualifier"
|
|
60
|
+
}
|
|
61
|
+
],
|
|
62
|
+
"required_files": ["bin/cli.js", "tests/cli.test.js", "data/rental-rules.json"],
|
|
63
|
+
"forbidden_files": [],
|
|
64
|
+
"max_deps_added": 0,
|
|
65
|
+
"spec_output_files": ["bin/cli.js", "tests/cli.test.js"]
|
|
66
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "F28-cli-rental-quote-rules",
|
|
3
|
+
"category": "high-risk",
|
|
4
|
+
"difficulty": "high",
|
|
5
|
+
"timeout_seconds": 1500,
|
|
6
|
+
"required_tools": ["node"],
|
|
7
|
+
"browser": false,
|
|
8
|
+
"deps_change_expected": false,
|
|
9
|
+
"intent": "Add a bench-cli rental-quote command that prices equipment rentals from data/rental-rules.json using rental-day, weekend surcharge, discount, protection, deposit, and combined stock-validation rules."
|
|
10
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# F28 setup - seed rental quote rules.
#
# Creates the data directory if needed and writes the rules file that the
# rental-quote command is expected to read (item rates, stock, deposits,
# weekend surcharge, protection price, coupons). Aborts on the first error.
set -e

rules_file="data/rental-rules.json"

mkdir -p "$(dirname "$rules_file")"

# Quoted heredoc delimiter: the JSON below is written verbatim, unexpanded.
cat <<'JSON' > "$rules_file"
{
"items": {
"CAM": { "daily_cents": 1200, "stock": 2, "deposit_cents": 5000 },
"LIGHT": { "daily_cents": 700, "stock": 3, "deposit_cents": 2000 },
"TRIPOD": { "daily_cents": 400, "stock": 5, "deposit_cents": 1000 }
},
"weekend_surcharge_percent": 25,
"protection_daily_cents": 300,
"coupons": {
"LONG3": { "percent": 10, "min_rental_days": 3 },
"NONE": { "percent": 0, "min_rental_days": 1 }
}
}
JSON

exit 0
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: "F28-cli-rental-quote-rules"
|
|
3
|
+
title: "Rental quote command with weekend and deposit rules"
|
|
4
|
+
status: planned
|
|
5
|
+
complexity: high
|
|
6
|
+
depends-on: []
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# F28 Rental quote command with weekend and deposit rules
|
|
10
|
+
|
|
11
|
+
## Context
|
|
12
|
+
|
|
13
|
+
`bench-cli` currently has greeting and version commands only. The task:
|
|
14
|
+
add a `rental-quote` command that reads an equipment rental request, prices it
|
|
15
|
+
from `data/rental-rules.json`, validates combined inventory, and prints one
|
|
16
|
+
exact integer-cents quote.
|
|
17
|
+
|
|
18
|
+
Rental quotes are operational contracts: duplicate item rows must be combined
|
|
19
|
+
before stock validation, weekend surcharge rules must be deterministic, and
|
|
20
|
+
success output must stay machine-readable without extra fields.
|
|
21
|
+
|
|
22
|
+
## Requirements
|
|
23
|
+
|
|
24
|
+
- [ ] `bench-cli rental-quote --input <path>` reads JSON shaped as `{ "start_date": "YYYY-MM-DD", "end_date": "YYYY-MM-DD", "coupon": string | null, "protection": boolean, "items": [{ "sku": string, "qty": number }] }`.
|
|
25
|
+
- [ ] Item daily rates, stock, deposits, weekend surcharge percent, protection daily price, and coupons come from `data/rental-rules.json`. Do not hardcode those values in the command implementation.
|
|
26
|
+
- [ ] `rental_days` is the number of calendar days from `start_date` inclusive to `end_date` exclusive, using UTC date math. `end_date` must be after `start_date`.
|
|
27
|
+
- [ ] A rental day is a weekend day when its UTC day is Saturday or Sunday.
|
|
28
|
+
- [ ] Combine duplicate SKUs before validating stock and before computing item rows. The output `items` array must contain one row per SKU in first-seen order.
|
|
29
|
+
- [ ] Validation happens before any quote is printed. Invalid JSON, missing `items`, unknown SKU, non-positive or non-integer `qty`, combined quantity over stock, invalid date format, `end_date` not after `start_date`, unknown coupon, or non-boolean `protection` exits `2` and writes exactly one JSON error object to stderr.
|
|
30
|
+
- [ ] Combined quantity over stock uses exact error shape `{ "error": "unavailable_inventory", "sku": string, "available": number, "requested": number }`.
|
|
31
|
+
- [ ] `subtotal_cents` is the sum of `daily_cents * combined_qty * rental_days` for all item rows.
|
|
32
|
+
- [ ] `weekend_surcharge_cents` is the sum of `Math.round(daily_cents * combined_qty * weekend_days * weekend_surcharge_percent / 100)` for all item rows.
|
|
33
|
+
- [ ] `discount_cents` is `Math.round((subtotal_cents + weekend_surcharge_cents) * coupon.percent / 100)` when a coupon is present and `rental_days >= coupon.min_rental_days`; otherwise `0`.
|
|
34
|
+
- [ ] `protection_cents` is `protection_daily_cents * total_combined_qty * rental_days` when `protection` is true; otherwise `0`.
|
|
35
|
+
- [ ] `deposit_cents` is the sum of `deposit_cents * combined_qty` for all item rows. Deposits are never discounted.
|
|
36
|
+
- [ ] `total_cents = subtotal_cents + weekend_surcharge_cents - discount_cents + protection_cents + deposit_cents`.
|
|
37
|
+
- [ ] On success, write exactly one JSON object to stdout and no stderr. Keys: `rental_days`, `weekend_days`, `subtotal_cents`, `weekend_surcharge_cents`, `discount_cents`, `protection_cents`, `deposit_cents`, `total_cents`, `items`.
|
|
38
|
+
- [ ] Each output item row has keys `sku`, `qty`, `rental_cents`, `deposit_cents`. `rental_cents` is `daily_cents * combined_qty * rental_days`, and row `deposit_cents` is the item's deposit times combined quantity.
|
|
39
|
+
- [ ] `tests/cli.test.js` is updated. Existing tests still pass AND at least two new tests cover `rental-quote`: one successful quote and one validation failure.
|
|
40
|
+
|
|
41
|
+
## Constraints
|
|
42
|
+
|
|
43
|
+
- **No new npm dependencies.**
|
|
44
|
+
- **No floating-money output.** All public amounts are integer cents.
|
|
45
|
+
- **No silent catches.** If parsing or file reading fails, emit a visible JSON error to stderr and exit `2`.
|
|
46
|
+
- **No extra stdout/stderr text** on the success path; downstream tooling parses stdout as JSON.
|
|
47
|
+
- **Lifecycle note.** The harness's DOCS phase flips this spec's frontmatter `status` after implementation completes — that is benchmark lifecycle bookkeeping, not a scope violation.
|
|
48
|
+
|
|
49
|
+
## Out of Scope
|
|
50
|
+
|
|
51
|
+
- Persisting reservations or mutating inventory.
|
|
52
|
+
- Hourly pricing, time zones beyond UTC date math, holidays, or blackout dates.
|
|
53
|
+
- Taxes, shipping, currencies, or payment capture.
|
|
54
|
+
- Adding server routes or web UI.
|
|
55
|
+
- Touching `server/`, `web/`, or `tests/server.test.js`.
|
|
56
|
+
|
|
57
|
+
## Verification
|
|
58
|
+
|
|
59
|
+
- `node --test tests/cli.test.js` exits 0.
|
|
60
|
+
- A Friday-to-Tuesday rental counts four rental days and two weekend days.
|
|
61
|
+
- Duplicate SKUs are combined before stock validation and pricing.
|
|
62
|
+
- A successful quote emits exact integer-cent weekend surcharge, discount, protection, deposit, and total fields.
|
|
63
|
+
- A combined quantity over stock exits `2`, prints one JSON error to stderr, and prints no stdout.
|
|
64
|
+
- The unavailable inventory error object includes `error`, `sku`, `available`, and `requested`.
|
|
65
|
+
- Changing `data/rental-rules.json` rates, deposits, or surcharge settings changes command output without code changes.
|
|
66
|
+
- `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched (the rental rules seed comes from setup, not the arm).
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Add a `rental-quote` command to `bench-cli` so users can run `bench-cli rental-quote --input <path>` with a JSON file shaped as `{ "start_date": "YYYY-MM-DD", "end_date": "YYYY-MM-DD", "coupon": string | null, "protection": boolean, "items": [{ "sku": string, "qty": number }] }`.
|
|
2
|
+
|
|
3
|
+
Read item daily rates, stock, deposits, weekend surcharge percent, protection daily price, and coupons from `data/rental-rules.json`. Do not hardcode those values. `rental_days` is the number of calendar days from `start_date` inclusive to `end_date` exclusive using UTC date math. A rental day is a weekend day when its UTC day is Saturday or Sunday.
|
|
4
|
+
|
|
5
|
+
Combine duplicate SKUs before stock validation and pricing. On success, write one JSON object to stdout and no stderr with keys `rental_days`, `weekend_days`, `subtotal_cents`, `weekend_surcharge_cents`, `discount_cents`, `protection_cents`, `deposit_cents`, `total_cents`, and `items`. Each item row has keys `sku`, `qty`, `rental_cents`, and `deposit_cents`.
|
|
6
|
+
|
|
7
|
+
Use these formulas: `subtotal_cents` is the sum of `daily_cents * combined_qty * rental_days`; `weekend_surcharge_cents` is the sum of `Math.round(daily_cents * combined_qty * weekend_days * weekend_surcharge_percent / 100)`; `discount_cents` is `Math.round((subtotal_cents + weekend_surcharge_cents) * coupon.percent / 100)` when a coupon is present and `rental_days >= coupon.min_rental_days`, otherwise `0`; `protection_cents` is `protection_daily_cents * total_combined_qty * rental_days` when `protection` is true, otherwise `0`; `deposit_cents` is the sum of item deposits times combined quantities; `total_cents = subtotal_cents + weekend_surcharge_cents - discount_cents + protection_cents + deposit_cents`.
|
|
8
|
+
|
|
9
|
+
Validation happens before any quote is printed. Invalid JSON, missing `items`, unknown SKU, non-positive or non-integer `qty`, combined quantity over stock, invalid date format, `end_date` not after `start_date`, unknown coupon, or non-boolean `protection` exits `2` and writes exactly one JSON error object to stderr. Combined quantity over stock must use exact shape `{ "error": "unavailable_inventory", "sku": string, "available": number, "requested": number }`.
|
|
10
|
+
|
|
11
|
+
Update `tests/cli.test.js` so existing tests still pass and at least two new tests cover `rental-quote`: one successful quote and one validation failure.
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// Verifier: a successful rental quote must match the exact output contract.
//
// Requests 2026-05-08 to 2026-05-12 with coupon LONG3, protection enabled,
// and CAM listed twice around a LIGHT row, then runs
// `bin/cli.js rental-quote --input <file>` from the benchmark workdir. It
// asserts exit status 0, empty stderr, and an exact JSON object on stdout
// (no extra keys, duplicate SKUs merged in first-seen order). Prints
// {"ok":true} on stdout once every check has passed.
const assert = require('node:assert');
const fs = require('node:fs');
const os = require('node:os');
const path = require('node:path');
const { spawnSync } = require('node:child_process');

// BENCH_WORKDIR points at the checkout under test; fall back to the cwd.
const workdir = process.env.BENCH_WORKDIR || process.cwd();
// The pid suffix keeps concurrently running verifiers from sharing a file.
const input = path.join(os.tmpdir(), `rental-success-${process.pid}.json`);

try {
  fs.writeFileSync(input, JSON.stringify({
    start_date: '2026-05-08',
    end_date: '2026-05-12',
    coupon: 'LONG3',
    protection: true,
    // CAM appears twice on purpose: the rows must be combined into qty 2.
    items: [
      { sku: 'CAM', qty: 1 },
      { sku: 'LIGHT', qty: 1 },
      { sku: 'CAM', qty: 1 }
    ]
  }));

  const proc = spawnSync('node', ['bin/cli.js', 'rental-quote', '--input', input], {
    cwd: workdir,
    encoding: 'utf8'
  });

  // Surface a failure to launch the CLI instead of a confusing `null !== 0`.
  assert.ifError(proc.error);
  assert.strictEqual(proc.status, 0, proc.stderr || proc.stdout);
  assert.strictEqual(proc.stderr, '');
  // Expected values follow the spec formulas applied to the rules seeded by
  // the fixture's setup.sh (CAM 1200/day, LIGHT 700/day, 25% weekend
  // surcharge, 300/day protection, LONG3 = 10%):
  //   subtotal   = (1200*2 + 700*1) * 4 days          = 12400
  //   surcharge  = 1200*2*2*25/100 + 700*1*2*25/100   = 1550
  //   discount   = round((12400 + 1550) * 10 / 100)   = 1395
  //   protection = 300 * 3 units * 4 days             = 3600
  //   deposit    = 5000*2 + 2000*1                    = 12000
  //   total      = 12400 + 1550 - 1395 + 3600 + 12000 = 28155
  assert.deepStrictEqual(JSON.parse(proc.stdout), {
    rental_days: 4,
    weekend_days: 2,
    subtotal_cents: 12400,
    weekend_surcharge_cents: 1550,
    discount_cents: 1395,
    protection_cents: 3600,
    deposit_cents: 12000,
    total_cents: 28155,
    items: [
      { sku: 'CAM', qty: 2, rental_cents: 9600, deposit_cents: 10000 },
      { sku: 'LIGHT', qty: 1, rental_cents: 2800, deposit_cents: 2000 }
    ]
  });
} finally {
  // Do not leave the request file behind in the shared temp directory.
  fs.rmSync(input, { force: true });
}

process.stdout.write(JSON.stringify({ ok: true }) + '\n');
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
// Verifier: the rental-quote command must read rates, deposits, surcharge,
// protection price and coupons from `data/rental-rules.json` rather than
// hardcoding them.
//
// Temporarily replaces the rules file with different values (CAM at
// 1000/day with a 3000 deposit, 50% weekend surcharge, 100/day protection,
// coupon HALF at 50%), runs `bin/cli.js rental-quote --input <file>`, and
// asserts the exact output derived from those replacement values. The
// original rules file is always restored. Prints {"ok":true} on stdout once
// every check has passed.
const assert = require('node:assert');
const fs = require('node:fs');
const os = require('node:os');
const path = require('node:path');
const { spawnSync } = require('node:child_process');

// BENCH_WORKDIR points at the checkout under test; fall back to the cwd.
const workdir = process.env.BENCH_WORKDIR || process.cwd();
const rulesPath = path.join(workdir, 'data', 'rental-rules.json');
// The pid suffix keeps concurrently running verifiers from sharing a file.
const input = path.join(os.tmpdir(), `rental-source-${process.pid}.json`);
// Snapshot taken before anything is modified so `finally` can restore it.
const original = fs.readFileSync(rulesPath, 'utf8');

try {
  fs.writeFileSync(rulesPath, JSON.stringify({
    items: {
      CAM: { daily_cents: 1000, stock: 3, deposit_cents: 3000 }
    },
    weekend_surcharge_percent: 50,
    protection_daily_cents: 100,
    coupons: {
      HALF: { percent: 50, min_rental_days: 1 }
    }
  }, null, 2) + '\n');

  fs.writeFileSync(input, JSON.stringify({
    start_date: '2026-05-09',
    end_date: '2026-05-11',
    coupon: 'HALF',
    protection: true,
    items: [
      { sku: 'CAM', qty: 1 }
    ]
  }));

  const proc = spawnSync('node', ['bin/cli.js', 'rental-quote', '--input', input], {
    cwd: workdir,
    encoding: 'utf8'
  });

  // Surface a failure to launch the CLI instead of a confusing `null !== 0`.
  assert.ifError(proc.error);
  assert.strictEqual(proc.status, 0, proc.stderr || proc.stdout);
  assert.strictEqual(proc.stderr, '');
  // Expected values follow the spec formulas applied to the replacement
  // rules written above:
  //   subtotal   = 1000 * 1 * 2 days                = 2000
  //   surcharge  = 1000 * 1 * 2 weekend days * 50%  = 1000
  //   discount   = round((2000 + 1000) * 50 / 100)  = 1500
  //   protection = 100 * 1 unit * 2 days            = 200
  //   deposit    = 3000 * 1                         = 3000
  //   total      = 2000 + 1000 - 1500 + 200 + 3000  = 4700
  assert.deepStrictEqual(JSON.parse(proc.stdout), {
    rental_days: 2,
    weekend_days: 2,
    subtotal_cents: 2000,
    weekend_surcharge_cents: 1000,
    discount_cents: 1500,
    protection_cents: 200,
    deposit_cents: 3000,
    total_cents: 4700,
    items: [
      { sku: 'CAM', qty: 1, rental_cents: 2000, deposit_cents: 3000 }
    ]
  });
} finally {
  try {
    // Restoring the rules file comes first: later verifiers depend on it.
    fs.writeFileSync(rulesPath, original);
  } finally {
    // Do not leave the request file behind in the shared temp directory.
    fs.rmSync(input, { force: true });
  }
}

process.stdout.write(JSON.stringify({ ok: true }) + '\n');
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
const assert = require('node:assert');
|
|
2
|
+
const fs = require('node:fs');
|
|
3
|
+
const os = require('node:os');
|
|
4
|
+
const path = require('node:path');
|
|
5
|
+
const { spawnSync } = require('node:child_process');
|
|
6
|
+
|
|
7
|
+
// Verifier script: submits a request that lists the same SKU twice (qty 1 and
// qty 2) and expects `rental-quote` to reject it with exit code 2 and an
// `unavailable_inventory` error reporting the combined quantity (3) against
// the available stock (2).
const workdir = process.env.BENCH_WORKDIR || process.cwd();
const input = path.join(os.tmpdir(), `rental-stock-${process.pid}.json`);

try {
  fs.writeFileSync(input, JSON.stringify({
    start_date: '2026-05-08',
    end_date: '2026-05-10',
    coupon: null,
    protection: false,
    items: [
      { sku: 'CAM', qty: 1 },
      { sku: 'CAM', qty: 2 }
    ]
  }));

  const proc = spawnSync('node', ['bin/cli.js', 'rental-quote', '--input', input], {
    cwd: workdir,
    encoding: 'utf8'
  });

  // On failure, surface whatever the CLI printed as the assertion message.
  assert.strictEqual(proc.status, 2, proc.stderr || proc.stdout);
  // The error must go to stderr only; stdout stays empty.
  assert.strictEqual(proc.stdout, '');
  assert.deepStrictEqual(JSON.parse(proc.stderr), {
    error: 'unavailable_inventory',
    sku: 'CAM',
    available: 2,
    requested: 3
  });
} finally {
  // Remove the temp request file so repeated runs do not accumulate files
  // in the OS temp directory.
  fs.rmSync(input, { force: true });
}

process.stdout.write(JSON.stringify({ ok: true }) + '\n');
|
|
@@ -73,7 +73,8 @@ Machine-readable acceptance criteria used by both `run-fixture.sh` verification
|
|
|
73
73
|
"cmd": "node bin/cli.js doctor",
|
|
74
74
|
"exit_code": 0,
|
|
75
75
|
"stdout_contains": ["doctor: "],
|
|
76
|
-
"stdout_not_contains": ["undefined"]
|
|
76
|
+
"stdout_not_contains": ["undefined"],
|
|
77
|
+
"contract_refs": []
|
|
77
78
|
}
|
|
78
79
|
],
|
|
79
80
|
"forbidden_patterns": [
|
|
@@ -86,14 +87,25 @@ Machine-readable acceptance criteria used by both `run-fixture.sh` verification
|
|
|
86
87
|
],
|
|
87
88
|
"required_files": ["bin/cli.js"],
|
|
88
89
|
"forbidden_files": [],
|
|
90
|
+
"tier_a_waivers": [],
|
|
91
|
+
"spec_output_files": ["bin/cli.js"],
|
|
89
92
|
"max_deps_added": 0
|
|
90
93
|
}
|
|
91
94
|
```
|
|
92
95
|
|
|
93
96
|
- **verification_commands** — runner executes each. Each command's pass/fail contributes to the arm's `verify_score`.
|
|
97
|
+
Commands run with `BENCH_WORKDIR` (fresh arm work tree) and
|
|
98
|
+
`BENCH_FIXTURE_DIR` (the fixture directory outside the arm work tree) in
|
|
99
|
+
the environment. Put discriminator/oracle scripts under the fixture
|
|
100
|
+
directory when the arm should not read the verifier source.
|
|
101
|
+
Any command that references `BENCH_FIXTURE_DIR` is a hidden oracle and must
|
|
102
|
+
include `contract_refs`: exact substrings from `spec.md` proving the oracle
|
|
103
|
+
tests a visible contract rather than inventing a narrower one.
|
|
94
104
|
- **forbidden_patterns** — regexes scanned across `diff.patch`. Match at `severity: "disqualifier"` is a hard-floor fail. Match at `severity: "warning"` goes into the judge's critical-findings report.
|
|
95
105
|
- **required_files** — must exist after the arm runs.
|
|
96
106
|
- **forbidden_files** — must NOT appear in the arm's diff.
|
|
107
|
+
- **tier_a_waivers** — optional globs for files the spec explicitly authorizes even though the Tier A scope oracle would normally flag them.
|
|
108
|
+
- **spec_output_files** — files or globs that define the authorized output surface for Tier B scope tracing.
|
|
97
109
|
- **max_deps_added** — count of new entries under `dependencies`/`devDependencies` in `package.json`. Exceeds → hard-floor fail.
|
|
98
110
|
|
|
99
111
|
## NOTES.md
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Collect patch.diff files into SWE-bench prediction JSONL."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSON Lines file and return its rows as dictionaries.

    Blank and whitespace-only lines are skipped.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        One dict per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with the file path and 1-based line number.
        ValueError: If a line decodes to something other than a JSON object.
    """
    rows: list[dict[str, Any]] = []
    with path.open(encoding="utf8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                value = json.loads(line)
            except json.JSONDecodeError as err:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which file and line was malformed.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {err.msg}", err.doc, err.pos
                ) from err
            if not isinstance(value, dict):
                raise ValueError(f"{path}:{line_no}: expected JSON object")
            rows.append(value)
    return rows
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def instance_ids_from_jsonl(path: Path | None) -> set[str] | None:
|
|
26
|
+
if path is None:
|
|
27
|
+
return None
|
|
28
|
+
ids: set[str] = set()
|
|
29
|
+
for row in read_jsonl(path):
|
|
30
|
+
instance_id = row.get("instance_id")
|
|
31
|
+
if not isinstance(instance_id, str) or not instance_id:
|
|
32
|
+
raise ValueError(f"{path}: row missing non-empty instance_id")
|
|
33
|
+
ids.add(instance_id)
|
|
34
|
+
return ids
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def collect_from_root(root: Path, patch_name: str, keep: set[str] | None) -> list[tuple[str, Path]]:
|
|
38
|
+
patches: list[tuple[str, Path]] = []
|
|
39
|
+
for patch_path in sorted(root.glob(f"*/{patch_name}")):
|
|
40
|
+
instance_id = patch_path.parent.name
|
|
41
|
+
if keep is not None and instance_id not in keep:
|
|
42
|
+
continue
|
|
43
|
+
patches.append((instance_id, patch_path))
|
|
44
|
+
return patches
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def main() -> int:
    """Collect per-instance patch files into a SWE-bench prediction JSONL file.

    Command-line arguments:
        --patch-root: Directory whose subdirectories are named by instance id.
        --patch-name: Patch file name inside each subdirectory
            (default ``patch.diff``).
        --instances-jsonl: Optional JSONL file; only its ``instance_id``
            values are collected.
        --model-name: Value written as ``model_name_or_path`` on every row.
        --out: Destination JSONL path; parent directories are created.
        --allow-empty: Skip empty patches instead of failing.

    Returns:
        ``0`` on success, after printing a JSON report to stdout.

    Raises:
        ValueError: If no patch files are found, or a patch is empty and
            ``--allow-empty`` was not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--patch-root", required=True, type=Path)
    parser.add_argument("--patch-name", default="patch.diff")
    parser.add_argument("--instances-jsonl", type=Path, help="Optional filter/validation set.")
    parser.add_argument("--model-name", required=True)
    parser.add_argument("--out", required=True, type=Path)
    parser.add_argument("--allow-empty", action="store_true")
    args = parser.parse_args()

    keep = instance_ids_from_jsonl(args.instances_jsonl)
    patches = collect_from_root(args.patch_root, args.patch_name, keep)
    if not patches:
        raise ValueError(f"no {args.patch_name} files found under {args.patch_root}")

    # Read and validate every patch before touching the output path, so a
    # failure never leaves a truncated or half-written predictions file.
    lines: list[str] = []
    skipped_empty: list[str] = []
    for instance_id, patch_path in patches:
        patch = patch_path.read_text(encoding="utf8")
        if not patch.strip():
            if args.allow_empty:
                skipped_empty.append(instance_id)
                continue
            raise ValueError(f"empty patch for {instance_id}: {patch_path}")
        lines.append(
            json.dumps(
                {
                    "instance_id": instance_id,
                    "model_name_or_path": args.model_name,
                    "model_patch": patch,
                }
            )
            + "\n"
        )

    args.out.parent.mkdir(parents=True, exist_ok=True)
    with args.out.open("w", encoding="utf8") as f:
        f.writelines(lines)

    report = {
        "patch_root": str(args.patch_root),
        "patch_name": args.patch_name,
        "model_name_or_path": args.model_name,
        "out": str(args.out),
        "predictions_written": len(lines),
        "empty_skipped": skipped_empty,
    }
    print(json.dumps(report, indent=2))
    return 0
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
if __name__ == "__main__":
|
|
98
|
+
raise SystemExit(main())
|