devlyn-cli 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +1 -1
- package/CLAUDE.md +2 -2
- package/README.md +82 -29
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +211 -18
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +3 -2
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/runtime-principles.md +5 -8
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +49 -18
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
- package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3642 -92
- /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
|
@@ -10,7 +10,7 @@ Independent quality layer. You answer one question: did the diff deliver what th
|
|
|
10
10
|
- `spec.md` (or `.devlyn/criteria.generated.md` for free-form mode) — the contract.
|
|
11
11
|
- `spec.expected.json` — the mechanical acceptance contract per `_shared/expected.schema.json`.
|
|
12
12
|
- The cumulative diff against `state.base_ref.sha`.
|
|
13
|
-
- The
|
|
13
|
+
- The source hash (`state.source.spec_sha256` for spec mode, `state.source.criteria_sha256` for generated free-form mode) — re-read the source contract from disk and confirm the hash matches; if it does not, write `state.phases.verify.verdict: "BLOCKED"` with reason `source_sha256_mismatch` and stop.
|
|
14
14
|
|
|
15
15
|
You do NOT receive: PLAN, IMPLEMENT's reasoning, BUILD_GATE's findings, CLEANUP's allowlist negotiations. Reading those would compromise independence.
|
|
16
16
|
</input>
|
|
@@ -21,10 +21,7 @@ You do NOT receive: PLAN, IMPLEMENT's reasoning, BUILD_GATE's findings, CLEANUP'
|
|
|
21
21
|
|
|
22
22
|
Re-run the mechanical checks fresh, independent of BUILD_GATE's earlier run:
|
|
23
23
|
|
|
24
|
-
1. `python3 .claude/skills/_shared/spec-verify-check.py --include-risk-probes` against the post-CLEANUP code.
|
|
25
|
-
2. Re-scan `spec.expected.json.forbidden_patterns` against the diff (Python re.search; honor each pattern's `files` allowlist).
|
|
26
|
-
3. Confirm `required_files` exist post-diff; confirm `forbidden_files` do not appear in the diff.
|
|
27
|
-
4. Confirm `max_deps_added` is not exceeded (`git diff -- package.json` for Node; equivalent for other ecosystems).
|
|
24
|
+
1. `SPEC_VERIFY_PHASE=verify_mechanical SPEC_VERIFY_FINDINGS_FILE=verify-mechanical.findings.jsonl SPEC_VERIFY_FINDING_PREFIX=VERIFY-MECH python3 .claude/skills/_shared/spec-verify-check.py --include-risk-probes` against the post-CLEANUP code. In spec mode, sibling `spec.expected.json` wins; a malformed sibling is CRITICAL, not a fallback. When `state.risk_profile.risk_probes_enabled == true`, missing `.devlyn/risk-probes.jsonl` is also CRITICAL. The script also checks `forbidden_patterns`, `required_files`, `forbidden_files`, and `max_deps_added`.
|
|
28
25
|
|
|
29
26
|
Emit findings to `.devlyn/verify-mechanical.findings.jsonl`, one finding per match. Severity comes from the pattern's `severity` field (disqualifier → CRITICAL, warning → MEDIUM).
|
|
30
27
|
|
|
@@ -87,28 +84,40 @@ design/style concerns remain non-binding MEDIUM and produce `PASS_WITH_ISSUES`.
|
|
|
87
84
|
|
|
88
85
|
### Pair-mode (when triggered by orchestrator)
|
|
89
86
|
|
|
90
|
-
Pair-mode is eligible only after MECHANICAL
|
|
91
|
-
Deterministic blockers
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
87
|
+
Pair-mode is eligible only after MECHANICAL and the primary JUDGE have no
|
|
88
|
+
verdict-binding findings. Deterministic blockers and primary JUDGE blockers
|
|
89
|
+
already decide the verdict and route to the fix loop; a second judge there
|
|
90
|
+
duplicates evidence and wastes wall-time. If MECHANICAL or the primary JUDGE
|
|
91
|
+
has a verdict-binding finding, record `pair_judge: null` and do not spawn the
|
|
92
|
+
second VERIFY agent.
|
|
95
93
|
|
|
96
94
|
When eligible, trigger pair-mode if any of these are true:
|
|
97
|
-
- `--pair-verify` was set.
|
|
95
|
+
- `state.pair_verify == true` (`--pair-verify` was set).
|
|
98
96
|
- `state.mode == "verify-only"`.
|
|
99
|
-
- The spec frontmatter has `complexity: high
|
|
100
|
-
|
|
97
|
+
- The spec frontmatter has `complexity: high`; legacy/external spec
|
|
98
|
+
`complexity: large` is accepted for compatibility, but new specs use `high`.
|
|
99
|
+
- Current free-form `state.complexity` is `"large"`; legacy `"high"` state remains accepted by the merge validator only for archived run compatibility.
|
|
101
100
|
- `state.risk_profile.high_risk == true`.
|
|
102
101
|
- `.devlyn/risk-probes.jsonl` exists or `state.risk_profile.risk_probes_enabled == true`.
|
|
103
|
-
-
|
|
102
|
+
- The spec includes an actionable solo-headroom hypothesis.
|
|
103
|
+
- MECHANICAL or the primary JUDGE emitted warning-level findings but no
|
|
104
|
+
verdict-binding blockers.
|
|
104
105
|
- `state.verify.coverage_failed == true`.
|
|
105
106
|
|
|
107
|
+
Malformed `state.risk_profile` is a VERIFY contract violation: it must be an
|
|
108
|
+
object, `high_risk` / `risk_probes_enabled` / `pair_default_enabled` must be
|
|
109
|
+
JSON booleans when present, and `reasons` must be a string array. Do not treat
|
|
110
|
+
missing or malformed risk state as low-risk; `verify-merge-findings.py` blocks
|
|
111
|
+
it because it can hide `risk.high` or `risk_probes.enabled` pair triggers.
|
|
112
|
+
|
|
106
113
|
If `--no-pair` was set, do not spawn the OTHER-engine judge. Record
|
|
107
114
|
`pair_trigger: { eligible: false, reasons: [], skipped_reason: "user_no_pair" }`
|
|
108
115
|
and continue with solo VERIFY. This is an explicit user opt-out, not an engine
|
|
109
|
-
availability fallback.
|
|
116
|
+
availability fallback. `--pair-verify` and `--no-pair` are mutually exclusive;
|
|
117
|
+
if both are present, stop with `BLOCKED:invalid-flags`.
|
|
110
118
|
|
|
111
|
-
|
|
119
|
+
After MECHANICAL and the primary JUDGE finish, compute and persist this before
|
|
120
|
+
spawning the OTHER-engine pair judge:
|
|
112
121
|
|
|
113
122
|
```json
|
|
114
123
|
"pair_trigger": {
|
|
@@ -118,9 +127,23 @@ Before JUDGE spawn, compute and persist:
|
|
|
118
127
|
}
|
|
119
128
|
```
|
|
120
129
|
|
|
121
|
-
If `eligible == true
|
|
122
|
-
|
|
123
|
-
|
|
130
|
+
If `eligible == true`, `reasons` must be non-empty and include every applicable canonical reason; for example, a spec with an actionable solo-headroom
|
|
131
|
+
hypothesis must include `spec.solo_headroom_hypothesis` even when another reason
|
|
132
|
+
such as `risk.high` also applies. The OTHER-engine judge is mandatory. Skipping
|
|
133
|
+
it is a VERIFY contract violation. If ineligible, record the
|
|
134
|
+
reason, e.g. `"mechanical_blocker"` or `"primary_judge_blocker"`.
|
|
135
|
+
|
|
136
|
+
`pair_trigger` is a strict contract, not advisory metadata. `eligible: true`
|
|
137
|
+
requires a non-empty `reasons` list and `skipped_reason: null`; `eligible: false`
|
|
138
|
+
requires an empty `reasons` list and a string/null `skipped_reason`. Do not emit
|
|
139
|
+
contradictory states such as `eligible: true` with `skipped_reason`, or
|
|
140
|
+
`eligible: false` with trigger reasons. `verify-merge-findings.py` blocks VERIFY
|
|
141
|
+
on malformed trigger state. Eligible triggers must contain only canonical
|
|
142
|
+
reasons, with at least one present: `mode.verify-only`, `complexity.high`, `complexity.large`,
|
|
143
|
+
`mode.pair-verify`, `spec.complexity.high`, `spec.complexity.large`,
|
|
144
|
+
`spec.solo_headroom_hypothesis`, `risk.high`, `risk_probes.enabled`,
|
|
145
|
+
`risk_probes.present`, `coverage.failed`, `mechanical.warning`, or
|
|
146
|
+
`judge.warning`.
|
|
124
147
|
|
|
125
148
|
The `--engine` flag never disables this rule. Explicit `--engine claude` means
|
|
126
149
|
Claude is the primary judge; if pair-mode triggers, Codex is still the mandatory
|
|
@@ -160,12 +183,22 @@ When eligible and the orchestrator spawns a second VERIFY agent with the OTHER e
|
|
|
160
183
|
after the first verdict-binding finding and emit JSONL. If both probes pass
|
|
161
184
|
and static scope/dependency checks show no blocker, emit PASS; do not continue
|
|
162
185
|
exhaustive exploration.
|
|
186
|
+
If the spec includes a solo-headroom hypothesis, one of the two targeted
|
|
187
|
+
probes must exercise that hypothesis with the visible command/input shape and
|
|
188
|
+
compare the full externally visible result. The probe must use the
|
|
189
|
+
hypothesis's backticked observable command as its command anchor before adding
|
|
190
|
+
bounded input variations. Do not substitute a neighboring easier edge case;
|
|
191
|
+
the pair judge exists to test the stated expected solo miss.
|
|
163
192
|
A targeted probe must compare the full externally visible result
|
|
164
193
|
(stdout/stderr/exit and full parsed output object, including accepted/scheduled
|
|
165
194
|
rows, rejected rows, and remaining state when present), not just a single
|
|
166
|
-
property.
|
|
167
|
-
|
|
168
|
-
|
|
195
|
+
property. When the spec names exact keys, row shapes, JSON object shape, or an
|
|
196
|
+
exact error body, compare parsed key sets/deep equality so aliased keys,
|
|
197
|
+
missing keys, and extra keys are verdict-binding failures. Use the spec's
|
|
198
|
+
visible input key names literally when constructing the probe input. For
|
|
199
|
+
priority/stateful specs, at least one probe must include an earlier input
|
|
200
|
+
entity that would succeed under input-order processing, a later higher-priority
|
|
201
|
+
entity that consumes or blocks the critical resource, and a
|
|
169
202
|
failure/blocked/rollback edge that determines a later entity's state. This is
|
|
170
203
|
the minimum compound shape for priority + failure/state-mutation bugs.
|
|
171
204
|
Scope qualifiers are binding for the pair judge too: do not reinterpret
|
|
@@ -181,12 +214,13 @@ When eligible and the orchestrator spawns a second VERIFY agent with the OTHER e
|
|
|
181
214
|
(or scheduled) and rejected rows.
|
|
182
215
|
|
|
183
216
|
Codex pair-JUDGE is read-only. Invoke `codex-monitored.sh` directly with
|
|
184
|
-
`-c model_reasoning_effort=medium`; this
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
217
|
+
`CODEX_MONITORED_ISOLATED=1` and `-c model_reasoning_effort=medium`; this is a
|
|
218
|
+
bounded two-probe review, not implementation. Isolation blocks user config,
|
|
219
|
+
AGENTS.md, pyx-memory, hooks, and project rules from hidden context/tool
|
|
220
|
+
side effects. Do not pipe it to `tail`, `head`, `grep`, `sed`, or `awk`.
|
|
221
|
+
Capture stdout/stderr directly. The Codex judge must return JSONL findings on
|
|
222
|
+
stdout; the orchestrator writes `.devlyn/verify.pair.findings.jsonl` and merges
|
|
223
|
+
verdicts. Do not ask Codex to `apply_patch` or edit `.devlyn`.
|
|
190
224
|
The Codex prompt must include a bounded-output contract: no harness-doc reads,
|
|
191
225
|
maximum two targeted probes before first output, stop on the first
|
|
192
226
|
verdict-binding finding, and emit PASS immediately after the bounded checks pass.
|
|
@@ -11,6 +11,7 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
|
|
|
11
11
|
"started_at": "2026-04-30T12:00:00Z",
|
|
12
12
|
"engine": "claude",
|
|
13
13
|
"mode": "spec",
|
|
14
|
+
"pair_verify": false,
|
|
14
15
|
"complexity": null,
|
|
15
16
|
"risk_profile": { "high_risk": false, "reasons": [], "risk_probes_enabled": false, "pair_default_enabled": true },
|
|
16
17
|
"base_ref": { "branch": "main", "sha": "abc123..." },
|
|
@@ -44,16 +45,18 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
|
|
|
44
45
|
|
|
45
46
|
- **version** — string. Bump major on a breaking schema change.
|
|
46
47
|
- **mode** — `"free-form" | "spec" | "verify-only"`.
|
|
48
|
+
- **pair_verify** — boolean. Set true only when the user passed `--pair-verify`; otherwise false. This is the durable state evidence for the `mode.pair-verify` pair-trigger reason. It is mutually exclusive with `risk_profile.pair_default_enabled == false` from `--no-pair`; `verify-merge-findings.py` blocks the contradictory state.
|
|
47
49
|
- **complexity** — `null | "trivial" | "medium" | "large"`. Free-form mode populates this; spec/verify-only mode leaves it null.
|
|
48
50
|
- **engine** — `"claude" | "codex" | "auto"` initially; a required unavailable engine stops the run with `BLOCKED:<engine>-unavailable`.
|
|
49
|
-
- **
|
|
51
|
+
- **source** — provenance for the contract all downstream phases read. Spec and verify-only mode set `type: "spec"`, `spec_path`, and `spec_sha256`. Free-form mode sets `type: "generated"`, leaves `spec_path`/`spec_sha256` null, and must set `criteria_path: ".devlyn/criteria.generated.md"` plus `criteria_sha256` from the generated file's raw bytes. VERIFY re-checks the matching hash before judging.
|
|
52
|
+
- **risk_profile** — PHASE 0 classification for conditional defaults. `high_risk` records durable-risk signals from the goal/spec; `risk_probes_enabled` is true for explicit `--risk-probes` or high-risk specs unless `--no-risk-probes`; `pair_default_enabled` is false only for explicit `--no-pair`. `risk_profile` must remain an object with boolean `high_risk`, `risk_probes_enabled`, and `pair_default_enabled` fields when present, plus `reasons` as a list of strings. Malformed `risk_profile` blocks VERIFY because pair-trigger reasons derive `risk.high` and `risk_probes.enabled` from this state.
|
|
50
53
|
- **rounds.global** — incremented every fix-loop pass (BUILD_GATE → fix-loop OR VERIFY → fix-loop).
|
|
51
54
|
- **phases.probe_derive** — optional PHASE 1.5 entry when `--risk-probes` is enabled. Artifacts include `.devlyn/risk-probes.jsonl`. Probe failures later surface through BUILD_GATE/VERIFY as `correctness.risk-probe-failed`.
|
|
52
55
|
- **bypasses** — array of phase names from `--bypass`. Valid: `"build-gate" | "cleanup"`. PLAN, IMPLEMENT, VERIFY are non-bypassable (orchestrator rejects at parse time).
|
|
53
56
|
- **implement_passed_sha** — captured at end of PHASE 2; null until then. Activates the post-implement invariant for CLEANUP and VERIFY.
|
|
54
57
|
- **criteria** — generated from spec's `## Requirements` checklist (one per `- [ ]`). `status: pending → implemented` is the legal transition. `failed_by_finding_ids` populates when VERIFY surfaces a finding tied to a criterion.
|
|
55
|
-
- **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for verify-only mode, high-risk specs, active risk probes, `complexity: high` specs, or `state.complexity` of `"
|
|
56
|
-
- **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. If eligible with
|
|
58
|
+
- **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for `state.pair_verify == true`, verify-only mode, high-risk specs, active risk probes, actionable solo-headroom hypotheses, `complexity: high` specs, or current free-form `state.complexity` of `"large"` when MECHANICAL and the primary JUDGE have no verdict-binding blockers. Legacy/external spec `complexity: large` remains accepted for compatibility; new specs use `high`. Legacy `"high"` state remains accepted by the merge validator only for archived run compatibility.
|
|
59
|
+
- **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. The shape is strict: `eligible: true` requires a non-empty reasons list containing every applicable canonical eligible reason and only canonical eligible reasons, plus `skipped_reason: null`; `eligible: false` requires an empty reasons list and may set only `user_no_pair`, `mechanical_blocker`, `primary_judge_blocker`, or null as the skip cause. Canonical eligible reasons are `mode.verify-only`, `mode.pair-verify`, `complexity.high`, `complexity.large`, `spec.complexity.high`, `spec.complexity.large`, `spec.solo_headroom_hypothesis`, `risk.high`, `risk_probes.enabled`, `risk_probes.present`, `coverage.failed`, `mechanical.warning`, and `judge.warning`. `user_no_pair` is valid only when `risk_profile.pair_default_enabled == false` from an explicit `--no-pair`; `mechanical_blocker` and `primary_judge_blocker` are valid only when the matching source has a verdict-binding finding. If state implies a pair decision is required but `pair_trigger` is missing, if it records `eligible:false` with no supported skip reason, if an eligible trigger omits an applicable reason such as `spec.solo_headroom_hypothesis`, or if any combination is malformed, `verify-merge-findings.py` blocks VERIFY.
|
|
57
60
|
|
|
58
61
|
## Per-phase shape
|
|
59
62
|
|
|
@@ -107,7 +110,7 @@ Per-phase summary table: `phase | verdict | duration_ms | round | triggered_by |
|
|
|
107
110
|
|
|
108
111
|
Findings table (post-IMPLEMENT phases only — they are findings-only): each finding's `severity | rule_id | file:line | message | confidence`.
|
|
109
112
|
|
|
110
|
-
Follow-up notes: any `--continue-on-large` assumptions, pair/risk-probe opt-out state, engine setup guidance for `BLOCKED:<engine>-unavailable`, any `state.verify.coverage_failed` axes.
|
|
113
|
+
Follow-up notes: any `--continue-on-large` assumptions, pair/risk-probe opt-out state, engine setup guidance for `BLOCKED:<engine>-unavailable`, `/devlyn:ideate` guidance for `BLOCKED:solo-headroom-hypothesis-required` that asks for the visible behavior `solo_claude` is expected to miss, `/devlyn:ideate` guidance for `BLOCKED:solo-ceiling-avoidance-required` that asks for the concrete difference from rejected or solo-saturated controls such as `S2`-`S6`, and any `state.verify.coverage_failed` axes.
|
|
111
114
|
|
|
112
115
|
## Archive contract
|
|
113
116
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "devlyn-cli",
|
|
3
|
-
"version": "2.3.0",
|
|
4
|
-
"description": "AI development toolkit for Claude Code — ideate,
|
|
3
|
+
"version": "2.3.2",
|
|
4
|
+
"description": "AI development toolkit for Claude Code — ideate, resolve, and ship with context engineering and agent orchestration",
|
|
5
5
|
"homepage": "https://github.com/fysoul17/devlyn-cli#readme",
|
|
6
6
|
"bin": {
|
|
7
7
|
"devlyn": "bin/devlyn.js"
|
|
@@ -20,13 +20,58 @@
|
|
|
20
20
|
"agents-config",
|
|
21
21
|
"optional-skills",
|
|
22
22
|
"benchmark/auto-resolve/BENCHMARK-DESIGN.md",
|
|
23
|
+
"benchmark/auto-resolve/BENCHMARK-RESULTS.md",
|
|
23
24
|
"benchmark/auto-resolve/README.md",
|
|
24
25
|
"benchmark/auto-resolve/RUBRIC.md",
|
|
26
|
+
"benchmark/auto-resolve/run-real-benchmark.md",
|
|
25
27
|
"benchmark/auto-resolve/fixtures/SCHEMA.md",
|
|
26
28
|
"benchmark/auto-resolve/fixtures/F*/**",
|
|
29
|
+
"benchmark/auto-resolve/fixtures/retired/F*/**",
|
|
30
|
+
"benchmark/auto-resolve/shadow-fixtures/S*/**",
|
|
27
31
|
"benchmark/auto-resolve/fixtures/test-repo/**",
|
|
28
32
|
"!benchmark/auto-resolve/fixtures/test-repo/node_modules/**",
|
|
33
|
+
"benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md",
|
|
34
|
+
"benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json",
|
|
35
|
+
"benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md",
|
|
36
|
+
"benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json",
|
|
37
|
+
"benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.md",
|
|
38
|
+
"benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.json",
|
|
39
|
+
"benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.md",
|
|
40
|
+
"benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json",
|
|
41
|
+
"benchmark/auto-resolve/results/20260507-f10-f11-tier1-full-pipeline/headroom-gate.md",
|
|
42
|
+
"benchmark/auto-resolve/results/20260507-f10-f11-tier1-full-pipeline/headroom-gate.json",
|
|
43
|
+
"benchmark/auto-resolve/results/20260508-f22-exact-error-headroom/headroom-gate.md",
|
|
44
|
+
"benchmark/auto-resolve/results/20260508-f22-exact-error-headroom/headroom-gate.json",
|
|
45
|
+
"benchmark/auto-resolve/results/20260508-f26-headroom/headroom-gate.md",
|
|
46
|
+
"benchmark/auto-resolve/results/20260508-f26-headroom/headroom-gate.json",
|
|
47
|
+
"benchmark/auto-resolve/results/20260511-f3-http-error-headroom/headroom-gate.md",
|
|
48
|
+
"benchmark/auto-resolve/results/20260511-f3-http-error-headroom/headroom-gate.json",
|
|
49
|
+
"benchmark/auto-resolve/results/20260511-f12-webhook-headroom/headroom-gate.md",
|
|
50
|
+
"benchmark/auto-resolve/results/20260511-f12-webhook-headroom/headroom-gate.json",
|
|
51
|
+
"benchmark/auto-resolve/results/20260511-f15-concurrency-headroom/headroom-gate.md",
|
|
52
|
+
"benchmark/auto-resolve/results/20260511-f15-concurrency-headroom/headroom-gate.json",
|
|
53
|
+
"benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.md",
|
|
54
|
+
"benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.json",
|
|
55
|
+
"benchmark/auto-resolve/results/20260512-f4-web-headroom/headroom-gate.md",
|
|
56
|
+
"benchmark/auto-resolve/results/20260512-f4-web-headroom/headroom-gate.json",
|
|
57
|
+
"benchmark/auto-resolve/results/20260512-f5-fixloop-headroom/headroom-gate.md",
|
|
58
|
+
"benchmark/auto-resolve/results/20260512-f5-fixloop-headroom/headroom-gate.json",
|
|
59
|
+
"benchmark/auto-resolve/results/20260512-f6-checksum-headroom/headroom-gate.md",
|
|
60
|
+
"benchmark/auto-resolve/results/20260512-f6-checksum-headroom/headroom-gate.json",
|
|
61
|
+
"benchmark/auto-resolve/results/20260512-f7-scope-headroom/headroom-gate.md",
|
|
62
|
+
"benchmark/auto-resolve/results/20260512-f7-scope-headroom/headroom-gate.json",
|
|
63
|
+
"benchmark/auto-resolve/results/20260512-f9-e2e-headroom/headroom-gate.md",
|
|
64
|
+
"benchmark/auto-resolve/results/20260512-f9-e2e-headroom/headroom-gate.json",
|
|
65
|
+
"benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.md",
|
|
66
|
+
"benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.json",
|
|
67
|
+
"benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.md",
|
|
68
|
+
"benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.json",
|
|
29
69
|
"benchmark/auto-resolve/scripts/**",
|
|
70
|
+
"!**/__pycache__",
|
|
71
|
+
"!**/__pycache__/**",
|
|
72
|
+
"!**/*.pyc",
|
|
73
|
+
"scripts/lint-fixtures.sh",
|
|
74
|
+
"scripts/lint-shadow-fixtures.sh",
|
|
30
75
|
"scripts/lint-skills.sh",
|
|
31
76
|
"CLAUDE.md",
|
|
32
77
|
"AGENTS.md"
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# lint-fixtures.sh — schema validity + structural check for golden fixtures/.
|
|
3
|
+
|
|
4
|
+
set -euo pipefail
|
|
5
|
+
|
|
6
|
+
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
|
7
|
+
FIXTURES_DIR="${DEVLYN_FIXTURES_DIR:-$REPO_ROOT/benchmark/auto-resolve/fixtures}"
|
|
8
|
+
FIXTURE_GLOB="${DEVLYN_FIXTURE_GLOB:-F*}"
|
|
9
|
+
RETIRED_FIXTURE_GLOB="${DEVLYN_RETIRED_FIXTURE_GLOB:-F*}"
|
|
10
|
+
REJECTED_REGISTRY="${DEVLYN_REJECTED_FIXTURE_REGISTRY:-$REPO_ROOT/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh}"
|
|
11
|
+
SCHEMA="${DEVLYN_EXPECTED_SCHEMA:-$REPO_ROOT/config/skills/_shared/expected.schema.json}"
|
|
12
|
+
SPEC_VERIFY_CHECK="$REPO_ROOT/config/skills/_shared/spec-verify-check.py"
|
|
13
|
+
SOLO_HEADROOM_CHECK="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py"
|
|
14
|
+
|
|
15
|
+
[ -d "$FIXTURES_DIR" ] || { echo "✗ $FIXTURES_DIR missing"; exit 1; }
|
|
16
|
+
[ -f "$SCHEMA" ] || { echo "✗ $SCHEMA missing"; exit 1; }
|
|
17
|
+
[ -f "$SPEC_VERIFY_CHECK" ] || { echo "✗ $SPEC_VERIFY_CHECK missing"; exit 1; }
|
|
18
|
+
[ -f "$SOLO_HEADROOM_CHECK" ] || { echo "✗ solo-headroom checker missing: $SOLO_HEADROOM_CHECK"; exit 1; }
|
|
19
|
+
[ -f "$REJECTED_REGISTRY" ] || { echo "✗ rejected fixture registry missing: $REJECTED_REGISTRY"; exit 1; }
|
|
20
|
+
|
|
21
|
+
# shellcheck source=/dev/null
|
|
22
|
+
source "$REJECTED_REGISTRY"
|
|
23
|
+
if ! declare -F rejected_pair_fixture_reason >/dev/null; then
|
|
24
|
+
echo "✗ rejected fixture registry must define rejected_pair_fixture_reason: $REJECTED_REGISTRY"
|
|
25
|
+
exit 1
|
|
26
|
+
fi
|
|
27
|
+
|
|
28
|
+
REQUIRED_FILES=(metadata.json spec.md task.txt expected.json setup.sh NOTES.md)
|
|
29
|
+
|
|
30
|
+
ERRORS=0
|
|
31
|
+
COUNT=0
|
|
32
|
+
RETIRED_COUNT=0
|
|
33
|
+
|
|
34
|
+
for d in "$FIXTURES_DIR"/$FIXTURE_GLOB/; do
|
|
35
|
+
[ -d "$d" ] || continue
|
|
36
|
+
COUNT=$((COUNT + 1))
|
|
37
|
+
fid="$(basename "$d")"
|
|
38
|
+
|
|
39
|
+
for f in "${REQUIRED_FILES[@]}"; do
|
|
40
|
+
if [ ! -f "$d/$f" ]; then
|
|
41
|
+
echo "✗ $fid: missing $f"
|
|
42
|
+
ERRORS=$((ERRORS + 1))
|
|
43
|
+
fi
|
|
44
|
+
done
|
|
45
|
+
|
|
46
|
+
if [ -f "$d/metadata.json" ]; then
|
|
47
|
+
# Read metadata.json's "id". The path is handed to Python via argv instead of being spliced into the -c source, so a fixture path containing a quote or backslash cannot break (or inject code into) the one-liner. Any failure (missing key, invalid JSON, unreadable file) yields an empty meta_id.
meta_id=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['id'])" "$d/metadata.json" 2>/dev/null || echo "")
|
|
48
|
+
if [ "$meta_id" != "$fid" ]; then
|
|
49
|
+
echo "✗ $fid: metadata.json id='$meta_id' does not match dir name"
|
|
50
|
+
ERRORS=$((ERRORS + 1))
|
|
51
|
+
fi
|
|
52
|
+
|
|
53
|
+
python3 - "$d/metadata.json" "$d/spec.md" "$fid" <<'PY' || ERRORS=$((ERRORS + 1))
|
|
54
|
+
import json
|
|
55
|
+
import re
|
|
56
|
+
import sys
|
|
57
|
+
|
|
58
|
+
metadata_path, spec_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
|
|
59
|
+
try:
|
|
60
|
+
metadata = json.load(open(metadata_path, encoding="utf-8"))
|
|
61
|
+
except Exception:
|
|
62
|
+
sys.exit(0)
|
|
63
|
+
if metadata.get("category") != "high-risk":
|
|
64
|
+
sys.exit(0)
|
|
65
|
+
intent = str(metadata.get("intent") or "")
|
|
66
|
+
try:
|
|
67
|
+
spec = open(spec_path, encoding="utf-8").read()
|
|
68
|
+
except FileNotFoundError:
|
|
69
|
+
spec = ""
|
|
70
|
+
text = f"{intent}\n{spec}".lower()
|
|
71
|
+
risk_pattern = re.compile(
|
|
72
|
+
r"\b("
|
|
73
|
+
r"auth|authz|permissions?|security|tokens?|sessions?|"
|
|
74
|
+
r"payments?|money|billing|invoices?|pricing|tax|ledger|"
|
|
75
|
+
r"persistence|persist\w*|data mutation|delet\w*|migrations?|"
|
|
76
|
+
r"idempoten\w*|replay|duplicates?|api|webhook|raw-body|signatures?|"
|
|
77
|
+
r"allocation|scheduling|inventory|rollback|transaction|"
|
|
78
|
+
r"priority|error-priority|output-shape|output shape|response-shape|response shape"
|
|
79
|
+
r")\b"
|
|
80
|
+
)
|
|
81
|
+
if not risk_pattern.search(text):
|
|
82
|
+
print(
|
|
83
|
+
f"✗ {fid}: high-risk fixture must include a resolve risk-trigger term "
|
|
84
|
+
"in metadata intent or spec.md"
|
|
85
|
+
)
|
|
86
|
+
sys.exit(1)
|
|
87
|
+
PY
|
|
88
|
+
fi
|
|
89
|
+
|
|
90
|
+
if [ -f "$d/spec.md" ]; then
|
|
91
|
+
spec_id=$(python3 - "$d/spec.md" <<'PY' 2>/dev/null || true
|
|
92
|
+
import re, sys
|
|
93
|
+
text = open(sys.argv[1], encoding="utf-8").read()
|
|
94
|
+
m = re.search(r'^id:\s*"?([^"\n]+)"?\s*$', text, re.M)
|
|
95
|
+
print(m.group(1) if m else "")
|
|
96
|
+
PY
|
|
97
|
+
)
|
|
98
|
+
if [ "$spec_id" != "$fid" ]; then
|
|
99
|
+
echo "✗ $fid: spec.md frontmatter id='$spec_id' does not match dir name"
|
|
100
|
+
ERRORS=$((ERRORS + 1))
|
|
101
|
+
fi
|
|
102
|
+
fi
|
|
103
|
+
|
|
104
|
+
if [ -f "$d/expected.json" ]; then
|
|
105
|
+
if ! python3 - "$d/expected.json" "$fid" <<'PY'
|
|
106
|
+
import json
|
|
107
|
+
import sys
|
|
108
|
+
|
|
109
|
+
expected_path, fid = sys.argv[1], sys.argv[2]
|
|
110
|
+
try:
|
|
111
|
+
data = json.load(open(expected_path, encoding="utf-8"))
|
|
112
|
+
except json.JSONDecodeError:
|
|
113
|
+
print(f"✗ {fid}: expected.json is not valid JSON")
|
|
114
|
+
sys.exit(1)
|
|
115
|
+
if not isinstance(data, dict):
|
|
116
|
+
print(f"✗ {fid}: expected.json must be an object")
|
|
117
|
+
sys.exit(1)
|
|
118
|
+
PY
|
|
119
|
+
then
|
|
120
|
+
ERRORS=$((ERRORS + 1))
|
|
121
|
+
continue
|
|
122
|
+
fi
|
|
123
|
+
|
|
124
|
+
n_cmds=$(python3 - "$d/expected.json" <<'PY'
|
|
125
|
+
import json
|
|
126
|
+
import sys
|
|
127
|
+
|
|
128
|
+
data = json.load(open(sys.argv[1], encoding="utf-8"))
|
|
129
|
+
commands = data.get("verification_commands", [])
|
|
130
|
+
print(len(commands) if isinstance(commands, list) else 0)
|
|
131
|
+
PY
|
|
132
|
+
)
|
|
133
|
+
if [ "$n_cmds" -lt 1 ]; then
|
|
134
|
+
echo "✗ $fid: expected.json has 0 verification_commands (need ≥1)"
|
|
135
|
+
ERRORS=$((ERRORS + 1))
|
|
136
|
+
fi
|
|
137
|
+
|
|
138
|
+
schema_ok=1
|
|
139
|
+
if ! python3 - "$SCHEMA" "$d/expected.json" "$fid" <<'PY'
|
|
140
|
+
import json, os, sys
|
|
141
|
+
schema_path, expected_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
|
|
142
|
+
schema = json.load(open(schema_path))
|
|
143
|
+
data = json.load(open(expected_path))
|
|
144
|
+
|
|
145
|
+
def is_string_list(value):
|
|
146
|
+
return isinstance(value, list) and all(isinstance(item, str) and item for item in value)
|
|
147
|
+
|
|
148
|
+
def fallback_validate():
|
|
149
|
+
allowed = set(schema["properties"])
|
|
150
|
+
errors = []
|
|
151
|
+
if not isinstance(data, dict):
|
|
152
|
+
return ["expected.json must be an object"]
|
|
153
|
+
unknown = sorted(set(data) - allowed)
|
|
154
|
+
if unknown:
|
|
155
|
+
errors.append(f"expected.json has unknown key(s): {', '.join(unknown)}")
|
|
156
|
+
commands = data.get("verification_commands", [])
|
|
157
|
+
if not isinstance(commands, list):
|
|
158
|
+
errors.append("verification_commands must be an array")
|
|
159
|
+
else:
|
|
160
|
+
for idx, command in enumerate(commands):
|
|
161
|
+
if not isinstance(command, dict):
|
|
162
|
+
errors.append(f"verification_commands[{idx}] must be an object")
|
|
163
|
+
continue
|
|
164
|
+
unknown_command = sorted(set(command) - {"cmd", "exit_code", "stdout_contains", "stdout_not_contains", "contract_refs"})
|
|
165
|
+
if unknown_command:
|
|
166
|
+
errors.append(f"verification_commands[{idx}] has unknown key(s): {', '.join(unknown_command)}")
|
|
167
|
+
if not isinstance(command.get("cmd"), str) or not command.get("cmd"):
|
|
168
|
+
errors.append(f"verification_commands[{idx}].cmd must be a non-empty string")
|
|
169
|
+
exit_code = command.get("exit_code", 0)
|
|
170
|
+
if isinstance(exit_code, bool) or not isinstance(exit_code, int):
|
|
171
|
+
errors.append(f"verification_commands[{idx}].exit_code must be an integer")
|
|
172
|
+
for key in ("stdout_contains", "stdout_not_contains", "contract_refs"):
|
|
173
|
+
if key in command and not is_string_list(command[key]):
|
|
174
|
+
errors.append(f"verification_commands[{idx}].{key} must be an array of non-empty strings")
|
|
175
|
+
patterns = data.get("forbidden_patterns", [])
|
|
176
|
+
if not isinstance(patterns, list):
|
|
177
|
+
errors.append("forbidden_patterns must be an array")
|
|
178
|
+
else:
|
|
179
|
+
for idx, pattern in enumerate(patterns):
|
|
180
|
+
if not isinstance(pattern, dict):
|
|
181
|
+
errors.append(f"forbidden_patterns[{idx}] must be an object")
|
|
182
|
+
continue
|
|
183
|
+
unknown_pattern = sorted(set(pattern) - {"pattern", "description", "files", "severity"})
|
|
184
|
+
if unknown_pattern:
|
|
185
|
+
errors.append(f"forbidden_patterns[{idx}] has unknown key(s): {', '.join(unknown_pattern)}")
|
|
186
|
+
for key in ("pattern", "description"):
|
|
187
|
+
if not isinstance(pattern.get(key), str) or not pattern.get(key):
|
|
188
|
+
errors.append(f"forbidden_patterns[{idx}].{key} must be a non-empty string")
|
|
189
|
+
if pattern.get("severity") not in {"disqualifier", "warning"}:
|
|
190
|
+
errors.append(f"forbidden_patterns[{idx}].severity must be disqualifier or warning")
|
|
191
|
+
if "files" in pattern and not is_string_list(pattern["files"]):
|
|
192
|
+
errors.append(f"forbidden_patterns[{idx}].files must be an array of non-empty strings")
|
|
193
|
+
for key in ("required_files", "forbidden_files", "tier_a_waivers", "spec_output_files"):
|
|
194
|
+
if key in data and not is_string_list(data[key]):
|
|
195
|
+
errors.append(f"{key} must be an array of non-empty strings")
|
|
196
|
+
max_deps_added = data.get("max_deps_added", 0)
|
|
197
|
+
if isinstance(max_deps_added, bool) or not isinstance(max_deps_added, int) or max_deps_added < 0:
|
|
198
|
+
errors.append("max_deps_added must be an integer >= 0")
|
|
199
|
+
return errors
|
|
200
|
+
|
|
201
|
+
force_fallback = os.environ.get("DEVLYN_LINT_FIXTURES_NO_JSONSCHEMA") == "1"
|
|
202
|
+
try:
|
|
203
|
+
if force_fallback:
|
|
204
|
+
raise ImportError
|
|
205
|
+
import jsonschema
|
|
206
|
+
except ImportError:
|
|
207
|
+
fallback_errors = fallback_validate()
|
|
208
|
+
if fallback_errors:
|
|
209
|
+
for error in fallback_errors:
|
|
210
|
+
print(f"✗ {fid}: expected.json schema violation: {error}")
|
|
211
|
+
sys.exit(1)
|
|
212
|
+
else:
|
|
213
|
+
try:
|
|
214
|
+
jsonschema.validate(data, schema)
|
|
215
|
+
except jsonschema.ValidationError as e:
|
|
216
|
+
print(f"✗ {fid}: expected.json schema violation: {e.message}")
|
|
217
|
+
sys.exit(1)
|
|
218
|
+
PY
|
|
219
|
+
then
|
|
220
|
+
ERRORS=$((ERRORS + 1))
|
|
221
|
+
schema_ok=0
|
|
222
|
+
fi
|
|
223
|
+
|
|
224
|
+
if [ "$schema_ok" -eq 1 ]; then
|
|
225
|
+
if ! python3 "$SPEC_VERIFY_CHECK" --check "$d/spec.md"; then
|
|
226
|
+
echo "✗ $fid: spec-verify-check --check failed"
|
|
227
|
+
ERRORS=$((ERRORS + 1))
|
|
228
|
+
fi
|
|
229
|
+
if ! python3 "$SPEC_VERIFY_CHECK" --check-expected "$d/expected.json"; then
|
|
230
|
+
echo "✗ $fid: spec-verify-check --check-expected failed"
|
|
231
|
+
ERRORS=$((ERRORS + 1))
|
|
232
|
+
fi
|
|
233
|
+
|
|
234
|
+
python3 - "$d/spec.md" "$d/expected.json" "$fid" <<'PY' || ERRORS=$((ERRORS + 1))
|
|
235
|
+
import json, pathlib, re, sys
|
|
236
|
+
spec_path, expected_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
|
|
237
|
+
spec = open(spec_path, encoding="utf-8").read()
|
|
238
|
+
expected = json.load(open(expected_path, encoding="utf-8"))
|
|
239
|
+
fixture_dir = pathlib.Path(expected_path).parent
|
|
240
|
+
fixture_root = fixture_dir.resolve()
|
|
241
|
+
errors = []
|
|
242
|
+
for idx, command in enumerate(expected.get("verification_commands", [])):
|
|
243
|
+
cmd = str(command.get("cmd", ""))
|
|
244
|
+
if "BENCH_FIXTURE_DIR" not in cmd:
|
|
245
|
+
continue
|
|
246
|
+
fixture_refs = re.findall(r"(?:\$\{BENCH_FIXTURE_DIR\}|\$BENCH_FIXTURE_DIR)/([^\"'\s]+)", cmd)
|
|
247
|
+
if not fixture_refs:
|
|
248
|
+
errors.append(
|
|
249
|
+
f"verification_commands[{idx}] hidden oracle must reference an explicit $BENCH_FIXTURE_DIR/... file"
|
|
250
|
+
)
|
|
251
|
+
stdout_contains = command.get("stdout_contains", [])
|
|
252
|
+
if '"ok":true' not in stdout_contains:
|
|
253
|
+
errors.append(
|
|
254
|
+
f"verification_commands[{idx}] hidden oracle must assert stdout_contains includes '\"ok\":true'"
|
|
255
|
+
)
|
|
256
|
+
for fixture_ref in fixture_refs:
|
|
257
|
+
target = (fixture_dir / fixture_ref).resolve(strict=False)
|
|
258
|
+
try:
|
|
259
|
+
target.relative_to(fixture_root)
|
|
260
|
+
except ValueError:
|
|
261
|
+
errors.append(
|
|
262
|
+
f"verification_commands[{idx}] BENCH_FIXTURE_DIR file escapes fixture dir: {fixture_ref!r}"
|
|
263
|
+
)
|
|
264
|
+
continue
|
|
265
|
+
if not target.is_file():
|
|
266
|
+
errors.append(
|
|
267
|
+
f"verification_commands[{idx}] BENCH_FIXTURE_DIR file not found: {fixture_ref!r}"
|
|
268
|
+
)
|
|
269
|
+
refs = command.get("contract_refs", [])
|
|
270
|
+
if not refs:
|
|
271
|
+
errors.append(f"verification_commands[{idx}] hidden oracle missing contract_refs")
|
|
272
|
+
continue
|
|
273
|
+
for ref in refs:
|
|
274
|
+
if ref not in spec:
|
|
275
|
+
errors.append(
|
|
276
|
+
f"verification_commands[{idx}] contract_ref not found in spec.md: {ref!r}"
|
|
277
|
+
)
|
|
278
|
+
if errors:
|
|
279
|
+
for err in errors:
|
|
280
|
+
print(f"✗ {fid}: {err}")
|
|
281
|
+
sys.exit(1)
|
|
282
|
+
PY
|
|
283
|
+
fi
|
|
284
|
+
fi
|
|
285
|
+
|
|
286
|
+
if [ -f "$d/setup.sh" ] && [ ! -x "$d/setup.sh" ]; then
|
|
287
|
+
echo "✗ $fid: setup.sh not executable (run: chmod +x $d/setup.sh)"
|
|
288
|
+
ERRORS=$((ERRORS + 1))
|
|
289
|
+
fi
|
|
290
|
+
|
|
291
|
+
if [ -f "$d/NOTES.md" ] \
|
|
292
|
+
&& { { grep -Fq 'headroom gate' "$d/NOTES.md" && grep -Eq '`?FAIL`?' "$d/NOTES.md"; } \
|
|
293
|
+
|| { grep -Fq 'pair-lift evidence' "$d/NOTES.md" && grep -Eiq 'reject|rejected' "$d/NOTES.md"; }; } \
|
|
294
|
+
&& ! rejected_pair_fixture_reason "$fid" >/dev/null 2>&1; then
|
|
295
|
+
echo "✗ $fid: NOTES.md records pair-candidate rejection but pair-rejected-fixtures.sh has no rejected reason"
|
|
296
|
+
ERRORS=$((ERRORS + 1))
|
|
297
|
+
fi
|
|
298
|
+
|
|
299
|
+
if [ -f "$d/NOTES.md" ] \
|
|
300
|
+
&& grep -Fq 'pair_evidence_passed' "$d/NOTES.md" \
|
|
301
|
+
&& ! python3 "$SOLO_HEADROOM_CHECK" --expected-json "$d/expected.json" "$d/spec.md"; then
|
|
302
|
+
echo "✗ $fid: pair_evidence_passed fixture spec.md must document an actionable solo-headroom hypothesis with solo_claude miss and observable command from expected.json"
|
|
303
|
+
ERRORS=$((ERRORS + 1))
|
|
304
|
+
fi
|
|
305
|
+
done
|
|
306
|
+
|
|
307
|
+
for d in "$FIXTURES_DIR"/retired/$RETIRED_FIXTURE_GLOB/; do
|
|
308
|
+
[ -d "$d" ] || continue
|
|
309
|
+
RETIRED_COUNT=$((RETIRED_COUNT + 1))
|
|
310
|
+
fid="$(basename "$d")"
|
|
311
|
+
|
|
312
|
+
if [ ! -f "$d/RETIRED.md" ]; then
|
|
313
|
+
echo "✗ retired/$fid: missing RETIRED.md"
|
|
314
|
+
ERRORS=$((ERRORS + 1))
|
|
315
|
+
fi
|
|
316
|
+
|
|
317
|
+
for f in "${REQUIRED_FILES[@]}"; do
|
|
318
|
+
if [ ! -f "$d/$f" ]; then
|
|
319
|
+
echo "✗ retired/$fid: missing preserved $f"
|
|
320
|
+
ERRORS=$((ERRORS + 1))
|
|
321
|
+
fi
|
|
322
|
+
done
|
|
323
|
+
|
|
324
|
+
if [ -f "$d/metadata.json" ]; then
|
|
325
|
+
# Read the retired fixture's metadata.json "id". The path is handed to Python via argv instead of being spliced into the -c source, so a path containing a quote or backslash cannot break (or inject code into) the one-liner. Any failure (missing key, invalid JSON, unreadable file) yields an empty meta_id.
meta_id=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['id'])" "$d/metadata.json" 2>/dev/null || echo "")
|
|
326
|
+
if [ "$meta_id" != "$fid" ]; then
|
|
327
|
+
echo "✗ retired/$fid: metadata.json id='$meta_id' does not match dir name"
|
|
328
|
+
ERRORS=$((ERRORS + 1))
|
|
329
|
+
fi
|
|
330
|
+
fi
|
|
331
|
+
|
|
332
|
+
if [ -f "$d/setup.sh" ] && [ ! -x "$d/setup.sh" ]; then
|
|
333
|
+
echo "✗ retired/$fid: setup.sh not executable (run: chmod +x $d/setup.sh)"
|
|
334
|
+
ERRORS=$((ERRORS + 1))
|
|
335
|
+
fi
|
|
336
|
+
done
|
|
337
|
+
|
|
338
|
+
if [ $COUNT -eq 0 ]; then
|
|
339
|
+
echo "✗ no fixtures found in $FIXTURES_DIR"
|
|
340
|
+
exit 1
|
|
341
|
+
fi
|
|
342
|
+
|
|
343
|
+
if [ $ERRORS -gt 0 ]; then
|
|
344
|
+
echo ""
|
|
345
|
+
echo "✗ lint-fixtures: $ERRORS error(s) across $COUNT active fixture(s) and $RETIRED_COUNT retired fixture(s)"
|
|
346
|
+
exit 1
|
|
347
|
+
fi
|
|
348
|
+
|
|
349
|
+
echo "✓ lint-fixtures: $COUNT active fixture(s) passed schema + structural checks; $RETIRED_COUNT retired fixture(s) preserved"
|