devlyn-cli 2.3.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +80 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +210 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# S1-cli-lang-flag NOTES
|
|
2
|
+
|
|
3
|
+
## What failure mode does this fixture detect?
|
|
4
|
+
|
|
5
|
+
**Spec-compliance precision under input-parsing pressure.** Bare LLMs handling unknown enum values tend to silently fall back to a default — the natural Node implementation pattern is `const greeting = greetings[lang] || greetings.en;`, which would silently degrade `--lang fr` to the English greeting instead of erroring. The spec explicitly forbids that behavior; the trap is whether the implementation respects the precise output contract in a user-input-parsing context.
|
|
6
|
+
|
|
7
|
+
## What pipeline phase(s) is this testing?
|
|
8
|
+
|
|
9
|
+
- **BUILD**: implementation must surface an error on unknown language, not silently fall back.
|
|
10
|
+
- **BUILD_GATE**: verification_commands match exact UTF-8 output literals for each language.
|
|
11
|
+
- **CLEANUP**: tests must be added; no other subcommand may be touched (scope discipline).
|
|
12
|
+
|
|
13
|
+
## Why can't another fixture cover this?
|
|
14
|
+
|
|
15
|
+
F2 has the silent-catch trap on a system-state subcommand (`doctor`). S1 puts the same trap shape in user-input-parsing context with multibyte UTF-8 output literals. Different surface category: F2 catches "filesystem error swallowing", S1 catches "user-input fall-through default."
|
|
16
|
+
|
|
17
|
+
## When should this be retired?
|
|
18
|
+
|
|
19
|
+
When two consecutive ship-gate runs show a bare disqualifier (DQ) rate ≤ 30% on this fixture — meaning bare LLMs have learned the pattern and the trap is no longer load-bearing as a categorical reliability signal.
|
|
20
|
+
|
|
21
|
+
## Mutation source
|
|
22
|
+
|
|
23
|
+
Direct mutation of `F1-cli-trivial-flag` (which adds `--name <name>`). S1 keeps the trivial-flag scaffold but adds:
|
|
24
|
+
- Multi-value enum (4 languages instead of 1 free-form name).
|
|
25
|
+
- Explicit error path on unknown enum (the fall-through trap).
|
|
26
|
+
- Multibyte literal output (UTF-8 Korean / Japanese strings, plus an ASCII Spanish string, as exact-match contract).
|
|
27
|
+
|
|
28
|
+
The combination produces a categorical-reliability gate F1 alone does not provide.
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
{
|
|
2
|
+
"verification_commands": [
|
|
3
|
+
{
|
|
4
|
+
"cmd": "node bin/cli.js hello",
|
|
5
|
+
"exit_code": 0,
|
|
6
|
+
"stdout_contains": ["Hello, world!"],
|
|
7
|
+
"stdout_not_contains": ["error", "Error", "undefined"]
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"cmd": "node bin/cli.js hello --lang en",
|
|
11
|
+
"exit_code": 0,
|
|
12
|
+
"stdout_contains": ["Hello, world!"],
|
|
13
|
+
"stdout_not_contains": []
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"cmd": "node bin/cli.js hello --lang ko",
|
|
17
|
+
"exit_code": 0,
|
|
18
|
+
"stdout_contains": ["안녕, world!"],
|
|
19
|
+
"stdout_not_contains": []
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"cmd": "node bin/cli.js hello --lang ja",
|
|
23
|
+
"exit_code": 0,
|
|
24
|
+
"stdout_contains": ["こんにちは, world!"],
|
|
25
|
+
"stdout_not_contains": []
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
"cmd": "node bin/cli.js hello --lang es",
|
|
29
|
+
"exit_code": 0,
|
|
30
|
+
"stdout_contains": ["Hola, world!"],
|
|
31
|
+
"stdout_not_contains": []
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"cmd": "node bin/cli.js hello --lang fr 2>&1; true",
|
|
35
|
+
"exit_code": 0,
|
|
36
|
+
"stdout_contains": ["fr"],
|
|
37
|
+
"stdout_not_contains": []
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"cmd": "node bin/cli.js hello --lang ko --name alice",
|
|
41
|
+
"exit_code": 0,
|
|
42
|
+
"stdout_contains": ["안녕, alice!"],
|
|
43
|
+
"stdout_not_contains": []
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"cmd": "node --test tests/cli.test.js",
|
|
47
|
+
"exit_code": 0,
|
|
48
|
+
"stdout_contains": [],
|
|
49
|
+
"stdout_not_contains": ["fail "]
|
|
50
|
+
}
|
|
51
|
+
],
|
|
52
|
+
"forbidden_patterns": [
|
|
53
|
+
{
|
|
54
|
+
"pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(?:\\[\\]|null|undefined|false|''|\\{\\s*\\})",
|
|
55
|
+
"description": "silent catch returning a fallback value — violates no-silent-catches policy. Structured error returns like `return { error, code }` are NOT silent and must not match.",
|
|
56
|
+
"files": ["bin/cli.js"],
|
|
57
|
+
"severity": "disqualifier"
|
|
58
|
+
}
|
|
59
|
+
],
|
|
60
|
+
"required_files": ["bin/cli.js"],
|
|
61
|
+
"forbidden_files": [],
|
|
62
|
+
"max_deps_added": 0
|
|
63
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "S1-cli-lang-flag",
|
|
3
|
+
"category": "trivial",
|
|
4
|
+
"difficulty": "trivial",
|
|
5
|
+
"timeout_seconds": 600,
|
|
6
|
+
"required_tools": ["node"],
|
|
7
|
+
"browser": false,
|
|
8
|
+
"deps_change_expected": false,
|
|
9
|
+
"intent": "Add a --lang flag to the hello subcommand that switches greeting between en, ko, ja, es; reject unknown languages with exit 1."
|
|
10
|
+
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: "S1-cli-lang-flag"
|
|
3
|
+
title: "Add --lang flag to hello subcommand"
|
|
4
|
+
status: planned
|
|
5
|
+
complexity: trivial
|
|
6
|
+
depends-on: []
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# S1 Add `--lang` to `hello`
|
|
10
|
+
|
|
11
|
+
## Context
|
|
12
|
+
|
|
13
|
+
`bench-test-repo` testers are international; the hard-coded English greeting is awkward for non-English usage. Add a `--lang <code>` flag to the `hello` subcommand that switches the greeting between English, Korean, Japanese, and Spanish.
|
|
14
|
+
|
|
15
|
+
## Requirements
|
|
16
|
+
|
|
17
|
+
- [ ] `node bin/cli.js hello` (no flag) prints exactly `Hello, world!` (unchanged from baseline).
|
|
18
|
+
- [ ] `node bin/cli.js hello --lang en` prints exactly `Hello, world!`.
|
|
19
|
+
- [ ] `node bin/cli.js hello --lang ko` prints exactly `안녕, world!`.
|
|
20
|
+
- [ ] `node bin/cli.js hello --lang ja` prints exactly `こんにちは, world!`.
|
|
21
|
+
- [ ] `node bin/cli.js hello --lang es` prints exactly `Hola, world!`.
|
|
22
|
+
- [ ] `node bin/cli.js hello --lang fr` exits 1 and prints the literal string `fr` to stderr or stdout (so the user can see that `fr` was rejected).
|
|
23
|
+
- [ ] `--lang` combines with `--name`: `node bin/cli.js hello --lang ko --name alice` prints exactly `안녕, alice!`.
|
|
24
|
+
|
|
25
|
+
## Constraints
|
|
26
|
+
|
|
27
|
+
- **Zero new npm dependencies.** Use only Node.js built-ins.
|
|
28
|
+
- **No silent catches.** Unknown `--lang` values must surface a user-visible error including the offending code; do NOT silently fall back to English.
|
|
29
|
+
- **Touch only `bin/cli.js` and `tests/cli.test.js`.** Do NOT modify any other subcommand's handler.
|
|
30
|
+
|
|
31
|
+
## Out of Scope
|
|
32
|
+
|
|
33
|
+
- Adding more languages beyond `en`, `ko`, `ja`, `es`.
|
|
34
|
+
- Changing the trailing punctuation (`!` stays for all languages).
|
|
35
|
+
- Localizing the `world` placeholder (the noun stays English when no `--name` provided).
|
|
36
|
+
- Modifying `version`, `count`, `doctor`, or any other subcommand.
|
|
37
|
+
|
|
38
|
+
## Verification
|
|
39
|
+
|
|
40
|
+
- `node bin/cli.js hello` prints exactly `Hello, world!`.
|
|
41
|
+
- `node bin/cli.js hello --lang ko` prints exactly `안녕, world!`.
|
|
42
|
+
- `node bin/cli.js hello --lang ja` prints exactly `こんにちは, world!`.
|
|
43
|
+
- `node bin/cli.js hello --lang es` prints exactly `Hola, world!`.
|
|
44
|
+
- `node bin/cli.js hello --lang fr` exits 1 with `fr` visible in output.
|
|
45
|
+
- `node bin/cli.js hello --lang ko --name alice` prints exactly `안녕, alice!`.
|
|
46
|
+
- `node --test tests/cli.test.js` passes (existing tests + at least one new test for `--lang`).
|
|
47
|
+
- `git diff -- package.json` is empty (no new deps).
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Add a --lang flag to the hello subcommand. Support these language codes: en (English, default), ko (Korean), ja (Japanese), es (Spanish). Print the appropriate localized greeting. Unknown language codes should exit 1 with an error message that mentions the rejected code. The flag should also work with --name (e.g. `--lang ko --name alice` prints `안녕, alice!`). No new dependencies. Update the existing tests to cover the new flag.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# S2-cli-inventory-reservation NOTES
|
|
2
|
+
|
|
3
|
+
## What failure mode does this fixture detect?
|
|
4
|
+
|
|
5
|
+
Priority-sensitive inventory mutation with exact output-shape obligations.
|
|
6
|
+
Bare implementations commonly process input order instead of priority order,
|
|
7
|
+
partially decrement stock for rejected orders, or emit a plausible but wrong
|
|
8
|
+
JSON shape.
|
|
9
|
+
|
|
10
|
+
## What pipeline phase(s) is this testing?
|
|
11
|
+
|
|
12
|
+
- **PLAN / RISK_PROBES**: must notice priority ordering, rollback, duplicate id,
|
|
13
|
+
and response-shape contracts as executable risks.
|
|
14
|
+
- **IMPLEMENT**: must add a CLI command without broadening scope or adding deps.
|
|
15
|
+
- **VERIFY**: hidden oracles exercise compound behavior that simple unit tests
|
|
16
|
+
often miss.
|
|
17
|
+
|
|
18
|
+
## Why can't another fixture cover this?
|
|
19
|
+
|
|
20
|
+
S1 checks enum parsing and UTF-8 output. F31/F32 cover similar high-risk
|
|
21
|
+
rollback ideas in golden/control space, but S2 keeps the experiment in shadow
|
|
22
|
+
so new inventory allocation variants can be tested before promotion.
|
|
23
|
+
|
|
24
|
+
## When should this fixture be retired?
|
|
25
|
+
|
|
26
|
+
Retire or replace it if two consecutive measured runs show that both bare and
|
|
27
|
+
solo_claude consistently satisfy priority ordering, rollback, duplicate id
|
|
28
|
+
handling, and exact output shape without pair assistance.
|
|
29
|
+
|
|
30
|
+
## Calibration status
|
|
31
|
+
|
|
32
|
+
- `20260513-s2-inventory-headroom`: bare `33`, solo_claude `99`, headroom gate
|
|
33
|
+
`FAIL` because solo exceeded the `80` ceiling and timed out. Treat S2 as a
|
|
34
|
+
shadow control unless it is reworked to preserve solo_claude headroom.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"verification_commands": [
|
|
3
|
+
{
|
|
4
|
+
"cmd": "node --test tests/cli.test.js",
|
|
5
|
+
"exit_code": 0,
|
|
6
|
+
"stdout_contains": [],
|
|
7
|
+
"stdout_not_contains": ["not ok "]
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/priority-stock-reservation.js\"",
|
|
11
|
+
"exit_code": 0,
|
|
12
|
+
"stdout_contains": ["\"ok\":true"],
|
|
13
|
+
"stdout_not_contains": [],
|
|
14
|
+
"contract_refs": [
|
|
15
|
+
"Process orders globally by `priority` descending, then original input order ascending.",
|
|
16
|
+
"A reservation is all-or-nothing per order. It accepts only when the SKU exists and remaining stock has at least `qty`.",
|
|
17
|
+
"A rejected reservation must not change stock. Use reason `unknown_sku` when the SKU is absent, or `insufficient_stock` when the SKU exists but lacks enough remaining stock.",
|
|
18
|
+
"`reserved` is ordered by processing order. Each row has keys `id`, `sku`, `qty`.",
|
|
19
|
+
"`rejected` is ordered in the original input order. Each row has keys `id`, `reason`.",
|
|
20
|
+
"`stock` is an object whose keys are sorted alphabetically and whose values are remaining quantities.",
|
|
21
|
+
"On success, write exactly one JSON object to stdout and no stderr. Keys: `reserved`, `rejected`, `stock`."
|
|
22
|
+
]
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/duplicate-order-error.js\"",
|
|
26
|
+
"exit_code": 0,
|
|
27
|
+
"stdout_contains": ["\"ok\":true"],
|
|
28
|
+
"stdout_not_contains": [],
|
|
29
|
+
"contract_refs": [
|
|
30
|
+
"Before processing any order, duplicate order ids are invalid input: exit `2`, write exactly one JSON error object `{ \"error\": \"duplicate_order_id\", \"id\": string }` to stderr, and write no stdout."
|
|
31
|
+
]
|
|
32
|
+
}
|
|
33
|
+
],
|
|
34
|
+
"forbidden_patterns": [
|
|
35
|
+
{
|
|
36
|
+
"pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\}|\\[\\])",
|
|
37
|
+
"description": "silent catch returning fallback in reserve-stock path",
|
|
38
|
+
"files": ["bin/cli.js", "tests/cli.test.js"],
|
|
39
|
+
"severity": "disqualifier"
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
|
|
43
|
+
"description": "empty catch block",
|
|
44
|
+
"files": ["bin/cli.js", "tests/cli.test.js"],
|
|
45
|
+
"severity": "disqualifier"
|
|
46
|
+
}
|
|
47
|
+
],
|
|
48
|
+
"required_files": ["bin/cli.js", "tests/cli.test.js"],
|
|
49
|
+
"forbidden_files": [],
|
|
50
|
+
"tier_a_waivers": [],
|
|
51
|
+
"spec_output_files": ["bin/cli.js", "tests/cli.test.js"],
|
|
52
|
+
"max_deps_added": 0
|
|
53
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "S2-cli-inventory-reservation",
|
|
3
|
+
"category": "high-risk",
|
|
4
|
+
"difficulty": "high",
|
|
5
|
+
"timeout_seconds": 900,
|
|
6
|
+
"required_tools": ["node"],
|
|
7
|
+
"browser": false,
|
|
8
|
+
"deps_change_expected": false,
|
|
9
|
+
"intent": "Add an inventory reservation CLI command that processes orders by priority, preserves per-order rollback on insufficient stock, rejects duplicate order ids, and emits an exact JSON output shape."
|
|
10
|
+
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: "S2-cli-inventory-reservation"
|
|
3
|
+
title: "Add inventory reservation command"
|
|
4
|
+
status: planned
|
|
5
|
+
complexity: high
|
|
6
|
+
depends-on: []
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# S2 Add Inventory Reservation Command
|
|
10
|
+
|
|
11
|
+
## Context
|
|
12
|
+
|
|
13
|
+
Warehouse operators need a deterministic CLI path to reserve stock for a batch
|
|
14
|
+
of orders. Add an inventory reservation CLI command that processes orders by
|
|
15
|
+
priority, preserves per-order rollback on insufficient stock, rejects duplicate
|
|
16
|
+
order ids, and emits an exact JSON output shape.
|
|
17
|
+
|
|
18
|
+
## Requirements
|
|
19
|
+
|
|
20
|
+
- [ ] Add `reserve-stock` to `bin/cli.js`.
|
|
21
|
+
- [ ] Accept `--stock <json>` as a JSON object mapping SKU strings to non-negative integer quantities.
|
|
22
|
+
- [ ] Accept `--orders <json>` as a JSON array of order objects. Each order has keys `id`, `sku`, `qty`, and `priority`.
|
|
23
|
+
- [ ] Process orders globally by `priority` descending, then original input order ascending.
|
|
24
|
+
- [ ] A reservation is all-or-nothing per order. It accepts only when the SKU exists and remaining stock has at least `qty`.
|
|
25
|
+
- [ ] A rejected reservation must not change stock. Use reason `unknown_sku` when the SKU is absent, or `insufficient_stock` when the SKU exists but lacks enough remaining stock.
|
|
26
|
+
- [ ] `reserved` is ordered by processing order. Each row has keys `id`, `sku`, `qty`.
|
|
27
|
+
- [ ] `rejected` is ordered in the original input order. Each row has keys `id`, `reason`.
|
|
28
|
+
- [ ] `stock` is an object whose keys are sorted alphabetically and whose values are remaining quantities.
|
|
29
|
+
- [ ] On success, write exactly one JSON object to stdout and no stderr. Keys: `reserved`, `rejected`, `stock`.
|
|
30
|
+
- [ ] Before processing any order, duplicate order ids are invalid input: exit `2`, write exactly one JSON error object `{ "error": "duplicate_order_id", "id": string }` to stderr, and write no stdout.
|
|
31
|
+
|
|
32
|
+
## Constraints
|
|
33
|
+
|
|
34
|
+
- Use only Node.js built-ins; add no npm dependencies.
|
|
35
|
+
- Touch only `bin/cli.js` and `tests/cli.test.js`.
|
|
36
|
+
- Do not silently catch JSON parse or validation errors. Surface invalid input as a user-visible error with nonzero exit.
|
|
37
|
+
|
|
38
|
+
## Out of Scope
|
|
39
|
+
|
|
40
|
+
- Reading input from files.
|
|
41
|
+
- Supporting fractional quantities.
|
|
42
|
+
- Supporting reservations across multiple warehouses.
|
|
43
|
+
- Changing `hello`, `version`, server routes, or package metadata.
|
|
44
|
+
|
|
45
|
+
## Verification
|
|
46
|
+
|
|
47
|
+
- `node --test tests/cli.test.js` passes.
|
|
48
|
+
- `node "$BENCH_FIXTURE_DIR/verifiers/priority-stock-reservation.js"` prints `{"ok":true}`.
|
|
49
|
+
- `node "$BENCH_FIXTURE_DIR/verifiers/duplicate-order-error.js"` prints `{"ok":true}`.
|
|
50
|
+
- Solo-headroom hypothesis: solo_claude is expected to miss all-or-nothing stock rollback or original-order rejected rows under priority processing; observable command `node "$BENCH_FIXTURE_DIR/verifiers/priority-stock-reservation.js"` exposes the miss.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Add a `reserve-stock` CLI command. It should accept `--stock <json>` with SKU quantities and `--orders <json>` with order objects containing `id`, `sku`, `qty`, and `priority`. Process higher priority orders first, reserve all-or-nothing per order, keep stock unchanged for rejected orders, reject duplicate order ids with exit 2 and a JSON error, and print exactly one JSON object with `reserved`, `rejected`, and `stock`. Use no new dependencies and update CLI tests.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
'use strict';

/**
 * Verifier: `reserve-stock` must reject duplicate order ids.
 *
 * Invokes the CLI with two orders that share the id `dup` and asserts the
 * contract: exit code 2, empty stdout, and stderr that parses as exactly the
 * JSON object `{ "error": "duplicate_order_id", "id": "dup" }`.
 * Prints `{"ok":true}` when every assertion holds; any failed assertion
 * throws and makes this script exit nonzero.
 */
const assert = require('node:assert');
const { spawnSync } = require('node:child_process');
const path = require('node:path');

// Directory that holds the CLI under test; defaults to the current directory.
const work = process.env.BENCH_WORKDIR || process.cwd();
const cli = path.join(work, 'bin', 'cli.js');

const stock = JSON.stringify({ widget: 5 });
const orders = JSON.stringify([
  { id: 'dup', sku: 'widget', qty: 1, priority: 2 },
  { id: 'dup', sku: 'widget', qty: 1, priority: 1 }
]);

// process.execPath runs the CLI with the same Node binary as this verifier,
// so the check does not depend on which `node` happens to be first on PATH.
const result = spawnSync(
  process.execPath,
  [cli, 'reserve-stock', '--stock', stock, '--orders', orders],
  { cwd: work, encoding: 'utf8' }
);

// A failed spawn leaves `status` null; surface the real cause instead of an
// opaque "null !== 2" assertion failure.
assert.ifError(result.error);

assert.strictEqual(
  result.status,
  2,
  `expected exit code 2, got ${result.status}; stderr: ${result.stderr}`
);
assert.strictEqual(result.stdout, '', 'duplicate order ids must produce no stdout');
// JSON.parse rejects trailing content, so this also enforces "exactly one" object.
assert.deepStrictEqual(JSON.parse(result.stderr), {
  error: 'duplicate_order_id',
  id: 'dup'
});

console.log(JSON.stringify({ ok: true }));
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
'use strict';

/**
 * Verifier: `reserve-stock` priority processing, all-or-nothing reservation,
 * and exact output shape.
 *
 * Orders are supplied out of priority order and the stock object is supplied
 * with its keys out of alphabetical order, so the CLI has to sort both itself.
 * Prints `{"ok":true}` when every assertion holds; any failed assertion
 * throws and makes this script exit nonzero.
 */
const assert = require('node:assert');
const { spawnSync } = require('node:child_process');
const path = require('node:path');

// Directory that holds the CLI under test; defaults to the current directory.
const work = process.env.BENCH_WORKDIR || process.cwd();
const cli = path.join(work, 'bin', 'cli.js');

// Keys are deliberately NOT alphabetical: the output contract requires the
// CLI to emit `stock` with sorted keys regardless of input order.
const stock = JSON.stringify({
  widget: 3,
  cable: 1
});
const orders = JSON.stringify([
  { id: 'low-widget', sku: 'widget', qty: 2, priority: 1 },
  { id: 'vip-widget', sku: 'widget', qty: 2, priority: 10 },
  { id: 'vip-cable', sku: 'cable', qty: 2, priority: 9 },
  { id: 'std-widget', sku: 'widget', qty: 1, priority: 5 }
]);

// process.execPath runs the CLI with the same Node binary as this verifier,
// so the check does not depend on which `node` happens to be first on PATH.
const result = spawnSync(
  process.execPath,
  [cli, 'reserve-stock', '--stock', stock, '--orders', orders],
  { cwd: work, encoding: 'utf8' }
);

// A failed spawn leaves `status` null; surface the real cause first.
assert.ifError(result.error);

assert.strictEqual(result.status, 0, result.stderr || result.stdout);
assert.strictEqual(result.stderr, '', 'success must write nothing to stderr');
// JSON.parse rejects trailing content, so this also enforces "exactly one" object.
const parsed = JSON.parse(result.stdout);

assert.deepStrictEqual(parsed, {
  // Processing order: vip-widget (10), vip-cable (9), std-widget (5), low-widget (1).
  reserved: [
    { id: 'vip-widget', sku: 'widget', qty: 2 },
    { id: 'std-widget', sku: 'widget', qty: 1 }
  ],
  // Rejections are reported in original input order, not processing order.
  rejected: [
    { id: 'low-widget', reason: 'insufficient_stock' },
    { id: 'vip-cable', reason: 'insufficient_stock' }
  ],
  stock: {
    cable: 1,
    widget: 0
  }
});

// deepStrictEqual ignores object key order, so the "keys are sorted
// alphabetically" contract needs its own explicit check.
assert.deepStrictEqual(
  Object.keys(parsed.stock),
  ['cable', 'widget'],
  'stock keys must be sorted alphabetically'
);

console.log(JSON.stringify({ ok: true }));
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# S3-cli-ticket-assignment NOTES
|
|
2
|
+
|
|
3
|
+
## What failure mode does this fixture detect?
|
|
4
|
+
|
|
5
|
+
Priority-sensitive support ticket allocation with deterministic tie-breakers and
|
|
6
|
+
exact output-shape obligations. Bare implementations commonly process tickets in input
|
|
7
|
+
order, pick the first matching agent instead of the specified capacity/id
|
|
8
|
+
tie-breaker, or report rejected tickets in processing order.
|
|
9
|
+
|
|
10
|
+
## What pipeline phase(s) is this testing?
|
|
11
|
+
|
|
12
|
+
- **PLAN / RISK_PROBES**: must notice priority ordering, allocation tie-breakers,
|
|
13
|
+
capacity mutation, duplicate id handling, and output-shape contracts.
|
|
14
|
+
- **IMPLEMENT**: must add a CLI command without broadening scope or adding deps.
|
|
15
|
+
- **VERIFY**: hidden oracles exercise compound behavior that simple unit tests
|
|
16
|
+
often miss.
|
|
17
|
+
|
|
18
|
+
## Why can't another fixture cover this?
|
|
19
|
+
|
|
20
|
+
S2 checks inventory reservation against SKU stock. S3 changes the resource
|
|
21
|
+
shape to agent skill matching and capacity tie-breakers, so it catches a
|
|
22
|
+
different allocation failure while staying in shadow.
|
|
23
|
+
|
|
24
|
+
## When should this fixture be retired?
|
|
25
|
+
|
|
26
|
+
Retire or replace it if two consecutive measured runs show that both bare and
|
|
27
|
+
solo_claude consistently satisfy priority ordering, agent tie-breakers,
|
|
28
|
+
duplicate id handling, and exact output shape without pair assistance.
|
|
29
|
+
|
|
30
|
+
## Calibration status
|
|
31
|
+
|
|
32
|
+
- `20260513-s3-ticket-headroom`: bare `33`, solo_claude `99`, headroom gate
|
|
33
|
+
`FAIL` because solo exceeded the `80` ceiling and timed out. Treat S3 as a
|
|
34
|
+
shadow control unless it is reworked to preserve solo_claude headroom.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
{
|
|
2
|
+
"verification_commands": [
|
|
3
|
+
{
|
|
4
|
+
"cmd": "node --test tests/cli.test.js",
|
|
5
|
+
"exit_code": 0,
|
|
6
|
+
"stdout_contains": [],
|
|
7
|
+
"stdout_not_contains": ["not ok "]
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/priority-agent-assignment.js\"",
|
|
11
|
+
"exit_code": 0,
|
|
12
|
+
"stdout_contains": ["\"ok\":true"],
|
|
13
|
+
"stdout_not_contains": [],
|
|
14
|
+
"contract_refs": [
|
|
15
|
+
"Process tickets globally by `priority` descending, then `created_at` ascending, then original input order ascending.",
|
|
16
|
+
"A ticket accepts only when at least one agent has the ticket skill and positive remaining capacity.",
|
|
17
|
+
"When multiple agents can accept a ticket, assign it to the agent with the most remaining capacity, then `id` ascending.",
|
|
18
|
+
"Accepted tickets decrement only the selected agent's remaining capacity by `1`.",
|
|
19
|
+
"Rejected tickets do not change any agent capacity. Use reason `no_agent` when no eligible agent is available at that point in processing.",
|
|
20
|
+
"`assigned` is ordered by processing order. Each row has keys `id`, `agent`.",
|
|
21
|
+
"`unassigned` is ordered in the original input order. Each row has keys `id`, `reason`.",
|
|
22
|
+
"`agents` is ordered by agent id ascending. Each row has keys `id`, `remaining`.",
|
|
23
|
+
"On success, write exactly one JSON object to stdout and no stderr. Keys: `assigned`, `unassigned`, `agents`."
|
|
24
|
+
]
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/duplicate-ticket-error.js\"",
|
|
28
|
+
"exit_code": 0,
|
|
29
|
+
"stdout_contains": ["\"ok\":true"],
|
|
30
|
+
"stdout_not_contains": [],
|
|
31
|
+
"contract_refs": [
|
|
32
|
+
"Before processing any ticket, duplicate ticket ids are invalid input: exit `2`, write exactly one JSON error object `{ \"error\": \"duplicate_ticket_id\", \"id\": string }` to stderr, and write no stdout."
|
|
33
|
+
]
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"forbidden_patterns": [
|
|
37
|
+
{
|
|
38
|
+
"pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\}|\\[\\])",
|
|
39
|
+
"description": "silent catch returning fallback in assign-tickets path",
|
|
40
|
+
"files": ["bin/cli.js", "tests/cli.test.js"],
|
|
41
|
+
"severity": "disqualifier"
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
"pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
|
|
45
|
+
"description": "empty catch block",
|
|
46
|
+
"files": ["bin/cli.js", "tests/cli.test.js"],
|
|
47
|
+
"severity": "disqualifier"
|
|
48
|
+
}
|
|
49
|
+
],
|
|
50
|
+
"required_files": ["bin/cli.js", "tests/cli.test.js"],
|
|
51
|
+
"forbidden_files": [],
|
|
52
|
+
"tier_a_waivers": [],
|
|
53
|
+
"spec_output_files": ["bin/cli.js", "tests/cli.test.js"],
|
|
54
|
+
"max_deps_added": 0
|
|
55
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "S3-cli-ticket-assignment",
|
|
3
|
+
"category": "high-risk",
|
|
4
|
+
"difficulty": "high",
|
|
5
|
+
"timeout_seconds": 900,
|
|
6
|
+
"required_tools": ["node"],
|
|
7
|
+
"browser": false,
|
|
8
|
+
"deps_change_expected": false,
|
|
9
|
+
"intent": "Add a support ticket assignment CLI command that processes tickets by priority, allocates capacity by deterministic agent tie-breakers, rejects duplicates, and emits an exact JSON output shape."
|
|
10
|
+
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: "S3-cli-ticket-assignment"
|
|
3
|
+
title: "Add ticket assignment command"
|
|
4
|
+
status: planned
|
|
5
|
+
complexity: high
|
|
6
|
+
depends-on: []
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# S3 Add Ticket Assignment Command
|
|
10
|
+
|
|
11
|
+
## Context
|
|
12
|
+
|
|
13
|
+
Support operations need a deterministic CLI command that assigns tickets to
|
|
14
|
+
agents with matching skills and limited capacity. The command must process
|
|
15
|
+
higher-priority tickets first, apply a precise agent tie-breaker, decrement
|
|
16
|
+
remaining capacity only for accepted assignments, and emit an exact JSON shape.
|
|
17
|
+
|
|
18
|
+
## Requirements
|
|
19
|
+
|
|
20
|
+
- [ ] Add `assign-tickets` to `bin/cli.js`.
|
|
21
|
+
- [ ] Accept `--agents <json>` as a JSON array of agent objects. Each agent has keys `id`, `skills`, and `capacity`.
|
|
22
|
+
- [ ] Accept `--tickets <json>` as a JSON array of ticket objects. Each ticket has keys `id`, `skill`, `priority`, and `created_at`.
|
|
23
|
+
- [ ] Before processing any ticket, duplicate ticket ids are invalid input: exit `2`, write exactly one JSON error object `{ "error": "duplicate_ticket_id", "id": string }` to stderr, and write no stdout.
|
|
24
|
+
- [ ] Process tickets globally by `priority` descending, then `created_at` ascending, then original input order ascending.
|
|
25
|
+
- [ ] A ticket accepts only when at least one agent has the ticket skill and positive remaining capacity.
|
|
26
|
+
- [ ] When multiple agents can accept a ticket, assign it to the agent with the most remaining capacity, then `id` ascending.
|
|
27
|
+
- [ ] Accepted tickets decrement only the selected agent's remaining capacity by `1`.
|
|
28
|
+
- [ ] Rejected tickets do not change any agent capacity. Use reason `no_agent` when no eligible agent is available at that point in processing.
|
|
29
|
+
- [ ] `assigned` is ordered by processing order. Each row has keys `id`, `agent`.
|
|
30
|
+
- [ ] `unassigned` is ordered in the original input order. Each row has keys `id`, `reason`.
|
|
31
|
+
- [ ] `agents` is ordered by agent id ascending. Each row has keys `id`, `remaining`.
|
|
32
|
+
- [ ] On success, write exactly one JSON object to stdout and no stderr. Keys: `assigned`, `unassigned`, `agents`.
|
|
33
|
+
|
|
34
|
+
## Constraints
|
|
35
|
+
|
|
36
|
+
- Use only Node.js built-ins; add no npm dependencies.
|
|
37
|
+
- Touch only `bin/cli.js` and `tests/cli.test.js`.
|
|
38
|
+
- Do not silently catch JSON parse or validation errors. Surface invalid input as a user-visible error with nonzero exit.
|
|
39
|
+
- Do not persist assignments or capacity between command invocations.
|
|
40
|
+
|
|
41
|
+
## Out of Scope
|
|
42
|
+
|
|
43
|
+
- Reading input from files.
|
|
44
|
+
- Weighted skills, agent schedules, or SLA clocks.
|
|
45
|
+
- Changing `hello`, `version`, server routes, or package metadata.
|
|
46
|
+
|
|
47
|
+
## Verification
|
|
48
|
+
|
|
49
|
+
- `node --test tests/cli.test.js` passes.
|
|
50
|
+
- `node "$BENCH_FIXTURE_DIR/verifiers/priority-agent-assignment.js"` prints `{"ok":true}`.
|
|
51
|
+
- `node "$BENCH_FIXTURE_DIR/verifiers/duplicate-ticket-error.js"` prints `{"ok":true}`.
|
|
52
|
+
- Solo-headroom hypothesis: solo_claude is expected to miss the remaining-capacity tie-breaker or original-order unassigned rows under priority processing; observable command `node "$BENCH_FIXTURE_DIR/verifiers/priority-agent-assignment.js"` exposes the miss.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Add an `assign-tickets` CLI command. It should accept `--agents <json>` with agent capacity and skills, and `--tickets <json>` with ticket ids, skills, priorities, and creation times. Process higher priority tickets first, assign each accepted ticket to the eligible agent with the most remaining capacity then lowest id, reject duplicate ticket ids with exit 2 and a JSON error, and print exactly one JSON object with `assigned`, `unassigned`, and `agents`. Use no new dependencies and update CLI tests.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
'use strict';

/**
 * Verifier: `assign-tickets` must reject duplicate ticket ids.
 *
 * Invokes the CLI with two tickets that share the id `dup` and asserts the
 * contract: exit code 2, empty stdout, and stderr that parses as exactly the
 * JSON object `{ "error": "duplicate_ticket_id", "id": "dup" }`.
 * Prints `{"ok":true}` when every assertion holds; any failed assertion
 * throws and makes this script exit nonzero.
 */
const assert = require('node:assert');
const { spawnSync } = require('node:child_process');
const path = require('node:path');

// Directory that holds the CLI under test; defaults to the current directory.
const work = process.env.BENCH_WORKDIR || process.cwd();
const cli = path.join(work, 'bin', 'cli.js');

const agents = JSON.stringify([
  { id: 'a-west', skills: ['billing'], capacity: 1 }
]);
const tickets = JSON.stringify([
  { id: 'dup', skill: 'billing', priority: 2, created_at: '2026-01-01T00:00:00Z' },
  { id: 'dup', skill: 'billing', priority: 1, created_at: '2026-01-02T00:00:00Z' }
]);

// process.execPath runs the CLI with the same Node binary as this verifier,
// so the check does not depend on which `node` happens to be first on PATH.
const result = spawnSync(
  process.execPath,
  [cli, 'assign-tickets', '--agents', agents, '--tickets', tickets],
  { cwd: work, encoding: 'utf8' }
);

// A failed spawn leaves `status` null; surface the real cause instead of an
// opaque "null !== 2" assertion failure.
assert.ifError(result.error);

assert.strictEqual(
  result.status,
  2,
  `expected exit code 2, got ${result.status}; stderr: ${result.stderr}`
);
assert.strictEqual(result.stdout, '', 'duplicate ticket ids must produce no stdout');
// JSON.parse rejects trailing content, so this also enforces "exactly one" object.
assert.deepStrictEqual(JSON.parse(result.stderr), {
  error: 'duplicate_ticket_id',
  id: 'dup'
});

console.log(JSON.stringify({ ok: true }));
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
'use strict';

/**
 * Verifier: `assign-tickets` priority processing, agent tie-breakers, and
 * exact output shape.
 *
 * Tickets are supplied out of priority order, so the CLI has to sort them
 * before assigning. Prints `{"ok":true}` when every assertion holds; any
 * failed assertion throws and makes this script exit nonzero.
 */
const assert = require('node:assert');
const { spawnSync } = require('node:child_process');
const path = require('node:path');

// Directory that holds the CLI under test; defaults to the current directory.
const work = process.env.BENCH_WORKDIR || process.cwd();
const cli = path.join(work, 'bin', 'cli.js');

const agents = JSON.stringify([
  { id: 'a-west', skills: ['billing'], capacity: 1 },
  { id: 'b-flex', skills: ['billing', 'tech'], capacity: 2 },
  { id: 'c-tech', skills: ['tech'], capacity: 1 }
]);
const tickets = JSON.stringify([
  { id: 'low-billing', skill: 'billing', priority: 1, created_at: '2026-01-01T00:00:00Z' },
  { id: 'vip-tech', skill: 'tech', priority: 9, created_at: '2026-01-01T00:00:00Z' },
  { id: 'vip-billing', skill: 'billing', priority: 10, created_at: '2026-01-02T00:00:00Z' },
  { id: 'std-tech', skill: 'tech', priority: 5, created_at: '2026-01-01T00:00:00Z' },
  { id: 'late-billing', skill: 'billing', priority: 8, created_at: '2026-01-01T00:00:00Z' }
]);

// process.execPath runs the CLI with the same Node binary as this verifier,
// so the check does not depend on which `node` happens to be first on PATH.
const result = spawnSync(
  process.execPath,
  [cli, 'assign-tickets', '--agents', agents, '--tickets', tickets],
  { cwd: work, encoding: 'utf8' }
);

// A failed spawn leaves `status` null; surface the real cause first.
assert.ifError(result.error);

assert.strictEqual(result.status, 0, result.stderr || result.stdout);
assert.strictEqual(result.stderr, '', 'success must write nothing to stderr');
// JSON.parse rejects trailing content, so this also enforces "exactly one" object.
const parsed = JSON.parse(result.stdout);

assert.deepStrictEqual(parsed, {
  // Processing order by priority: vip-billing (10), vip-tech (9),
  // late-billing (8), std-tech (5), low-billing (1).
  assigned: [
    // b-flex has the most remaining capacity (2) among billing agents.
    { id: 'vip-billing', agent: 'b-flex' },
    // b-flex and c-tech tie at remaining 1; the lower id wins.
    { id: 'vip-tech', agent: 'b-flex' },
    { id: 'late-billing', agent: 'a-west' },
    { id: 'std-tech', agent: 'c-tech' }
  ],
  // Every billing-capable agent is exhausted by the time low-billing is processed.
  unassigned: [
    { id: 'low-billing', reason: 'no_agent' }
  ],
  agents: [
    { id: 'a-west', remaining: 0 },
    { id: 'b-flex', remaining: 0 },
    { id: 'c-tech', remaining: 0 }
  ]
});

console.log(JSON.stringify({ ok: true }));
|