devlyn-cli 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +2 -2
- package/CLAUDE.md +4 -4
- package/README.md +85 -34
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
- package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
- package/benchmark/auto-resolve/README.md +307 -44
- package/benchmark/auto-resolve/RUBRIC.md +23 -14
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
- package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
- package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
- package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
- package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
- package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
- package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
- package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
- package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
- package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
- package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
- package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
- package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
- package/benchmark/auto-resolve/scripts/judge.sh +153 -26
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
- package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
- package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
- package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
- package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
- package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
- package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
- package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
- package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
- package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
- package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
- package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
- package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
- package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
- package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
- package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
- package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
- package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
- package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
- package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
- package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
- package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
- package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
- package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
- package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
- package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
- package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
- package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
- package/bin/devlyn.js +221 -17
- package/config/skills/_shared/adapters/README.md +3 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
- package/config/skills/_shared/adapters/opus-4-7.md +9 -1
- package/config/skills/_shared/archive_run.py +78 -6
- package/config/skills/_shared/codex-config.md +5 -4
- package/config/skills/_shared/codex-monitored.sh +46 -1
- package/config/skills/_shared/collect-codex-findings.py +20 -5
- package/config/skills/_shared/engine-preflight.md +17 -13
- package/config/skills/_shared/runtime-principles.md +6 -9
- package/config/skills/_shared/spec-verify-check.py +2664 -107
- package/config/skills/_shared/verify-merge-findings.py +1369 -19
- package/config/skills/devlyn:design-ui/SKILL.md +364 -0
- package/config/skills/devlyn:ideate/SKILL.md +7 -4
- package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
- package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
- package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
- package/config/skills/devlyn:resolve/SKILL.md +78 -26
- package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
- package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
- package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
- package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
- package/package.json +47 -2
- package/scripts/lint-fixtures.sh +349 -0
- package/scripts/lint-shadow-fixtures.sh +58 -0
- package/scripts/lint-skills.sh +3645 -95
|
@@ -11,7 +11,9 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
|
|
|
11
11
|
"started_at": "2026-04-30T12:00:00Z",
|
|
12
12
|
"engine": "claude",
|
|
13
13
|
"mode": "spec",
|
|
14
|
+
"pair_verify": false,
|
|
14
15
|
"complexity": null,
|
|
16
|
+
"risk_profile": { "high_risk": false, "reasons": [], "risk_probes_enabled": false, "pair_default_enabled": true },
|
|
15
17
|
"base_ref": { "branch": "main", "sha": "abc123..." },
|
|
16
18
|
"rounds": { "max_rounds": 4, "global": 0 },
|
|
17
19
|
"bypasses": [],
|
|
@@ -43,15 +45,18 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
|
|
|
43
45
|
|
|
44
46
|
- **version** — string. Bump major on a breaking schema change.
|
|
45
47
|
- **mode** — `"free-form" | "spec" | "verify-only"`.
|
|
48
|
+
- **pair_verify** — boolean. Set true only when the user passed `--pair-verify`; otherwise false. This is the durable state evidence for the `mode.pair-verify` pair-trigger reason. It is mutually exclusive with `risk_profile.pair_default_enabled == false` from `--no-pair`; `verify-merge-findings.py` blocks the contradictory state.
|
|
46
49
|
- **complexity** — `null | "trivial" | "medium" | "large"`. Free-form mode populates this; spec/verify-only mode leaves it null.
|
|
47
|
-
- **engine** — `"claude" | "codex" | "auto"` initially;
|
|
50
|
+
- **engine** — `"claude" | "codex" | "auto"` initially; a required unavailable engine stops the run with `BLOCKED:<engine>-unavailable`.
|
|
51
|
+
- **source** — provenance for the contract all downstream phases read. Spec and verify-only mode set `type: "spec"`, `spec_path`, and `spec_sha256`. Free-form mode sets `type: "generated"`, leaves `spec_path`/`spec_sha256` null, and must set `criteria_path: ".devlyn/criteria.generated.md"` plus `criteria_sha256` from the generated file's raw bytes. VERIFY re-checks the matching hash before judging.
|
|
52
|
+
- **risk_profile** — PHASE 0 classification for conditional defaults. `high_risk` records durable-risk signals from the goal/spec; `risk_probes_enabled` is true for explicit `--risk-probes` or high-risk specs unless `--no-risk-probes`; `pair_default_enabled` is false only for explicit `--no-pair`. `risk_profile` must remain an object with boolean `high_risk`, `risk_probes_enabled`, and `pair_default_enabled` fields when present, plus `reasons` as a list of strings. Malformed `risk_profile` blocks VERIFY because pair-trigger reasons derive `risk.high` and `risk_probes.enabled` from this state.
|
|
48
53
|
- **rounds.global** — incremented every fix-loop pass (BUILD_GATE → fix-loop OR VERIFY → fix-loop).
|
|
49
54
|
- **phases.probe_derive** — optional PHASE 1.5 entry when `--risk-probes` is enabled. Artifacts include `.devlyn/risk-probes.jsonl`. Probe failures later surface through BUILD_GATE/VERIFY as `correctness.risk-probe-failed`.
|
|
50
55
|
- **bypasses** — array of phase names from `--bypass`. Valid: `"build-gate" | "cleanup"`. PLAN, IMPLEMENT, VERIFY are non-bypassable (orchestrator rejects at parse time).
|
|
51
56
|
- **implement_passed_sha** — captured at end of PHASE 2; null until then. Activates the post-implement invariant for CLEANUP and VERIFY.
|
|
52
57
|
- **criteria** — generated from spec's `## Requirements` checklist (one per `- [ ]`). `status: pending → implemented` is the legal transition. `failed_by_finding_ids` populates when VERIFY surfaces a finding tied to a criterion.
|
|
53
|
-
- **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for `complexity: high` specs or `state.complexity` of `"
|
|
54
|
-
- **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. If eligible with
|
|
58
|
+
- **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for `state.pair_verify == true`, verify-only mode, high-risk specs, active risk probes, actionable solo-headroom hypotheses, `complexity: high` specs, or current free-form `state.complexity` of `"large"` when MECHANICAL and the primary JUDGE have no verdict-binding blockers. Legacy/external spec `complexity: large` remains accepted for compatibility; new specs use `high`. Legacy `"high"` state remains accepted by the merge validator only for archived run compatibility.
|
|
59
|
+
- **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. The shape is strict: `eligible: true` requires a non-empty reasons list containing every applicable canonical eligible reason and only canonical eligible reasons, plus `skipped_reason: null`; `eligible: false` requires an empty reasons list and may set only `user_no_pair`, `mechanical_blocker`, `primary_judge_blocker`, or null as the skip cause. Canonical eligible reasons are `mode.verify-only`, `mode.pair-verify`, `complexity.high`, `complexity.large`, `spec.complexity.high`, `spec.complexity.large`, `spec.solo_headroom_hypothesis`, `risk.high`, `risk_probes.enabled`, `risk_probes.present`, `coverage.failed`, `mechanical.warning`, and `judge.warning`. `user_no_pair` is valid only when `risk_profile.pair_default_enabled == false` from an explicit `--no-pair`; `mechanical_blocker` and `primary_judge_blocker` are valid only when the matching source has a verdict-binding finding. If state implies a pair decision is required but `pair_trigger` is missing, if it records `eligible:false` with no supported skip reason, if an eligible trigger omits an applicable reason such as `spec.solo_headroom_hypothesis`, or if any combination is malformed, `verify-merge-findings.py` blocks VERIFY.
|
|
55
60
|
|
|
56
61
|
## Per-phase shape
|
|
57
62
|
|
|
@@ -105,7 +110,7 @@ Per-phase summary table: `phase | verdict | duration_ms | round | triggered_by |
|
|
|
105
110
|
|
|
106
111
|
Findings table (post-IMPLEMENT phases only — they are findings-only): each finding's `severity | rule_id | file:line | message | confidence`.
|
|
107
112
|
|
|
108
|
-
Follow-up notes: any `--continue-on-large` assumptions,
|
|
113
|
+
Follow-up notes: any `--continue-on-large` assumptions, pair/risk-probe opt-out state, engine setup guidance for `BLOCKED:<engine>-unavailable`, `/devlyn:ideate` guidance for `BLOCKED:solo-headroom-hypothesis-required` that asks for the visible behavior `solo_claude` is expected to miss, `/devlyn:ideate` guidance for `BLOCKED:solo-ceiling-avoidance-required` that asks for the concrete difference from rejected or solo-saturated controls such as `S2`-`S6`, and any `state.verify.coverage_failed` axes.
|
|
109
114
|
|
|
110
115
|
## Archive contract
|
|
111
116
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "devlyn-cli",
|
|
3
|
-
"version": "2.2.2",
|
|
4
|
-
"description": "AI development toolkit for Claude Code — ideate,
|
|
3
|
+
"version": "2.3.1",
|
|
4
|
+
"description": "AI development toolkit for Claude Code — ideate, resolve, and ship with context engineering and agent orchestration",
|
|
5
5
|
"homepage": "https://github.com/fysoul17/devlyn-cli#readme",
|
|
6
6
|
"bin": {
|
|
7
7
|
"devlyn": "bin/devlyn.js"
|
|
@@ -20,13 +20,58 @@
|
|
|
20
20
|
"agents-config",
|
|
21
21
|
"optional-skills",
|
|
22
22
|
"benchmark/auto-resolve/BENCHMARK-DESIGN.md",
|
|
23
|
+
"benchmark/auto-resolve/BENCHMARK-RESULTS.md",
|
|
23
24
|
"benchmark/auto-resolve/README.md",
|
|
24
25
|
"benchmark/auto-resolve/RUBRIC.md",
|
|
26
|
+
"benchmark/auto-resolve/run-real-benchmark.md",
|
|
25
27
|
"benchmark/auto-resolve/fixtures/SCHEMA.md",
|
|
26
28
|
"benchmark/auto-resolve/fixtures/F*/**",
|
|
29
|
+
"benchmark/auto-resolve/fixtures/retired/F*/**",
|
|
30
|
+
"benchmark/auto-resolve/shadow-fixtures/S*/**",
|
|
27
31
|
"benchmark/auto-resolve/fixtures/test-repo/**",
|
|
28
32
|
"!benchmark/auto-resolve/fixtures/test-repo/node_modules/**",
|
|
33
|
+
"benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md",
|
|
34
|
+
"benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json",
|
|
35
|
+
"benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md",
|
|
36
|
+
"benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json",
|
|
37
|
+
"benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.md",
|
|
38
|
+
"benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.json",
|
|
39
|
+
"benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.md",
|
|
40
|
+
"benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json",
|
|
41
|
+
"benchmark/auto-resolve/results/20260507-f10-f11-tier1-full-pipeline/headroom-gate.md",
|
|
42
|
+
"benchmark/auto-resolve/results/20260507-f10-f11-tier1-full-pipeline/headroom-gate.json",
|
|
43
|
+
"benchmark/auto-resolve/results/20260508-f22-exact-error-headroom/headroom-gate.md",
|
|
44
|
+
"benchmark/auto-resolve/results/20260508-f22-exact-error-headroom/headroom-gate.json",
|
|
45
|
+
"benchmark/auto-resolve/results/20260508-f26-headroom/headroom-gate.md",
|
|
46
|
+
"benchmark/auto-resolve/results/20260508-f26-headroom/headroom-gate.json",
|
|
47
|
+
"benchmark/auto-resolve/results/20260511-f3-http-error-headroom/headroom-gate.md",
|
|
48
|
+
"benchmark/auto-resolve/results/20260511-f3-http-error-headroom/headroom-gate.json",
|
|
49
|
+
"benchmark/auto-resolve/results/20260511-f12-webhook-headroom/headroom-gate.md",
|
|
50
|
+
"benchmark/auto-resolve/results/20260511-f12-webhook-headroom/headroom-gate.json",
|
|
51
|
+
"benchmark/auto-resolve/results/20260511-f15-concurrency-headroom/headroom-gate.md",
|
|
52
|
+
"benchmark/auto-resolve/results/20260511-f15-concurrency-headroom/headroom-gate.json",
|
|
53
|
+
"benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.md",
|
|
54
|
+
"benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.json",
|
|
55
|
+
"benchmark/auto-resolve/results/20260512-f4-web-headroom/headroom-gate.md",
|
|
56
|
+
"benchmark/auto-resolve/results/20260512-f4-web-headroom/headroom-gate.json",
|
|
57
|
+
"benchmark/auto-resolve/results/20260512-f5-fixloop-headroom/headroom-gate.md",
|
|
58
|
+
"benchmark/auto-resolve/results/20260512-f5-fixloop-headroom/headroom-gate.json",
|
|
59
|
+
"benchmark/auto-resolve/results/20260512-f6-checksum-headroom/headroom-gate.md",
|
|
60
|
+
"benchmark/auto-resolve/results/20260512-f6-checksum-headroom/headroom-gate.json",
|
|
61
|
+
"benchmark/auto-resolve/results/20260512-f7-scope-headroom/headroom-gate.md",
|
|
62
|
+
"benchmark/auto-resolve/results/20260512-f7-scope-headroom/headroom-gate.json",
|
|
63
|
+
"benchmark/auto-resolve/results/20260512-f9-e2e-headroom/headroom-gate.md",
|
|
64
|
+
"benchmark/auto-resolve/results/20260512-f9-e2e-headroom/headroom-gate.json",
|
|
65
|
+
"benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.md",
|
|
66
|
+
"benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.json",
|
|
67
|
+
"benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.md",
|
|
68
|
+
"benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.json",
|
|
29
69
|
"benchmark/auto-resolve/scripts/**",
|
|
70
|
+
"!**/__pycache__",
|
|
71
|
+
"!**/__pycache__/**",
|
|
72
|
+
"!**/*.pyc",
|
|
73
|
+
"scripts/lint-fixtures.sh",
|
|
74
|
+
"scripts/lint-shadow-fixtures.sh",
|
|
30
75
|
"scripts/lint-skills.sh",
|
|
31
76
|
"CLAUDE.md",
|
|
32
77
|
"AGENTS.md"
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# lint-fixtures.sh — schema validity + structural check for golden fixtures/.
|
|
3
|
+
|
|
4
|
+
set -euo pipefail
|
|
5
|
+
|
|
6
|
+
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
|
7
|
+
FIXTURES_DIR="${DEVLYN_FIXTURES_DIR:-$REPO_ROOT/benchmark/auto-resolve/fixtures}"
|
|
8
|
+
FIXTURE_GLOB="${DEVLYN_FIXTURE_GLOB:-F*}"
|
|
9
|
+
RETIRED_FIXTURE_GLOB="${DEVLYN_RETIRED_FIXTURE_GLOB:-F*}"
|
|
10
|
+
REJECTED_REGISTRY="${DEVLYN_REJECTED_FIXTURE_REGISTRY:-$REPO_ROOT/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh}"
|
|
11
|
+
SCHEMA="${DEVLYN_EXPECTED_SCHEMA:-$REPO_ROOT/config/skills/_shared/expected.schema.json}"
|
|
12
|
+
SPEC_VERIFY_CHECK="$REPO_ROOT/config/skills/_shared/spec-verify-check.py"
|
|
13
|
+
SOLO_HEADROOM_CHECK="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py"
|
|
14
|
+
|
|
15
|
+
[ -d "$FIXTURES_DIR" ] || { echo "✗ $FIXTURES_DIR missing"; exit 1; }
|
|
16
|
+
[ -f "$SCHEMA" ] || { echo "✗ $SCHEMA missing"; exit 1; }
|
|
17
|
+
[ -f "$SPEC_VERIFY_CHECK" ] || { echo "✗ $SPEC_VERIFY_CHECK missing"; exit 1; }
|
|
18
|
+
[ -f "$SOLO_HEADROOM_CHECK" ] || { echo "✗ solo-headroom checker missing: $SOLO_HEADROOM_CHECK"; exit 1; }
|
|
19
|
+
[ -f "$REJECTED_REGISTRY" ] || { echo "✗ rejected fixture registry missing: $REJECTED_REGISTRY"; exit 1; }
|
|
20
|
+
|
|
21
|
+
# shellcheck source=/dev/null
|
|
22
|
+
source "$REJECTED_REGISTRY"
|
|
23
|
+
if ! declare -F rejected_pair_fixture_reason >/dev/null; then
|
|
24
|
+
echo "✗ rejected fixture registry must define rejected_pair_fixture_reason: $REJECTED_REGISTRY"
|
|
25
|
+
exit 1
|
|
26
|
+
fi
|
|
27
|
+
|
|
28
|
+
REQUIRED_FILES=(metadata.json spec.md task.txt expected.json setup.sh NOTES.md)
|
|
29
|
+
|
|
30
|
+
ERRORS=0
|
|
31
|
+
COUNT=0
|
|
32
|
+
RETIRED_COUNT=0
|
|
33
|
+
|
|
34
|
+
for d in "$FIXTURES_DIR"/$FIXTURE_GLOB/; do
|
|
35
|
+
[ -d "$d" ] || continue
|
|
36
|
+
COUNT=$((COUNT + 1))
|
|
37
|
+
fid="$(basename "$d")"
|
|
38
|
+
|
|
39
|
+
for f in "${REQUIRED_FILES[@]}"; do
|
|
40
|
+
if [ ! -f "$d/$f" ]; then
|
|
41
|
+
echo "✗ $fid: missing $f"
|
|
42
|
+
ERRORS=$((ERRORS + 1))
|
|
43
|
+
fi
|
|
44
|
+
done
|
|
45
|
+
|
|
46
|
+
if [ -f "$d/metadata.json" ]; then
|
|
47
|
+
# Read metadata.json's "id". The path is handed to Python as argv[1] rather than
# spliced into the Python source, so fixture paths containing quotes or
# backslashes cannot break (or inject code into) the snippet. Any failure
# (missing key, invalid JSON, unreadable file) yields an empty id.
meta_id=$(python3 -c 'import json,sys; print(json.load(open(sys.argv[1]))["id"])' "$d/metadata.json" 2>/dev/null || echo "")
|
|
48
|
+
if [ "$meta_id" != "$fid" ]; then
|
|
49
|
+
echo "✗ $fid: metadata.json id='$meta_id' does not match dir name"
|
|
50
|
+
ERRORS=$((ERRORS + 1))
|
|
51
|
+
fi
|
|
52
|
+
|
|
53
|
+
python3 - "$d/metadata.json" "$d/spec.md" "$fid" <<'PY' || ERRORS=$((ERRORS + 1))
|
|
54
|
+
import json
|
|
55
|
+
import re
|
|
56
|
+
import sys
|
|
57
|
+
|
|
58
|
+
metadata_path, spec_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
|
|
59
|
+
try:
|
|
60
|
+
metadata = json.load(open(metadata_path, encoding="utf-8"))
|
|
61
|
+
except Exception:
|
|
62
|
+
sys.exit(0)
|
|
63
|
+
if metadata.get("category") != "high-risk":
|
|
64
|
+
sys.exit(0)
|
|
65
|
+
intent = str(metadata.get("intent") or "")
|
|
66
|
+
try:
|
|
67
|
+
spec = open(spec_path, encoding="utf-8").read()
|
|
68
|
+
except FileNotFoundError:
|
|
69
|
+
spec = ""
|
|
70
|
+
text = f"{intent}\n{spec}".lower()
|
|
71
|
+
risk_pattern = re.compile(
|
|
72
|
+
r"\b("
|
|
73
|
+
r"auth|authz|permissions?|security|tokens?|sessions?|"
|
|
74
|
+
r"payments?|money|billing|invoices?|pricing|tax|ledger|"
|
|
75
|
+
r"persistence|persist\w*|data mutation|delet\w*|migrations?|"
|
|
76
|
+
r"idempoten\w*|replay|duplicates?|api|webhook|raw-body|signatures?|"
|
|
77
|
+
r"allocation|scheduling|inventory|rollback|transaction|"
|
|
78
|
+
r"priority|error-priority|output-shape|output shape|response-shape|response shape"
|
|
79
|
+
r")\b"
|
|
80
|
+
)
|
|
81
|
+
if not risk_pattern.search(text):
|
|
82
|
+
print(
|
|
83
|
+
f"✗ {fid}: high-risk fixture must include a resolve risk-trigger term "
|
|
84
|
+
"in metadata intent or spec.md"
|
|
85
|
+
)
|
|
86
|
+
sys.exit(1)
|
|
87
|
+
PY
|
|
88
|
+
fi
|
|
89
|
+
|
|
90
|
+
if [ -f "$d/spec.md" ]; then
|
|
91
|
+
spec_id=$(python3 - "$d/spec.md" <<'PY' 2>/dev/null || true
|
|
92
|
+
import re, sys
|
|
93
|
+
text = open(sys.argv[1], encoding="utf-8").read()
|
|
94
|
+
m = re.search(r'^id:\s*"?([^"\n]+)"?\s*$', text, re.M)
|
|
95
|
+
print(m.group(1) if m else "")
|
|
96
|
+
PY
|
|
97
|
+
)
|
|
98
|
+
if [ "$spec_id" != "$fid" ]; then
|
|
99
|
+
echo "✗ $fid: spec.md frontmatter id='$spec_id' does not match dir name"
|
|
100
|
+
ERRORS=$((ERRORS + 1))
|
|
101
|
+
fi
|
|
102
|
+
fi
|
|
103
|
+
|
|
104
|
+
if [ -f "$d/expected.json" ]; then
|
|
105
|
+
if ! python3 - "$d/expected.json" "$fid" <<'PY'
|
|
106
|
+
import json
|
|
107
|
+
import sys
|
|
108
|
+
|
|
109
|
+
expected_path, fid = sys.argv[1], sys.argv[2]
|
|
110
|
+
try:
|
|
111
|
+
data = json.load(open(expected_path, encoding="utf-8"))
|
|
112
|
+
except json.JSONDecodeError:
|
|
113
|
+
print(f"✗ {fid}: expected.json is not valid JSON")
|
|
114
|
+
sys.exit(1)
|
|
115
|
+
if not isinstance(data, dict):
|
|
116
|
+
print(f"✗ {fid}: expected.json must be an object")
|
|
117
|
+
sys.exit(1)
|
|
118
|
+
PY
|
|
119
|
+
then
|
|
120
|
+
ERRORS=$((ERRORS + 1))
|
|
121
|
+
continue
|
|
122
|
+
fi
|
|
123
|
+
|
|
124
|
+
n_cmds=$(python3 - "$d/expected.json" <<'PY'
|
|
125
|
+
import json
|
|
126
|
+
import sys
|
|
127
|
+
|
|
128
|
+
data = json.load(open(sys.argv[1], encoding="utf-8"))
|
|
129
|
+
commands = data.get("verification_commands", [])
|
|
130
|
+
print(len(commands) if isinstance(commands, list) else 0)
|
|
131
|
+
PY
|
|
132
|
+
)
|
|
133
|
+
if [ "$n_cmds" -lt 1 ]; then
|
|
134
|
+
echo "✗ $fid: expected.json has 0 verification_commands (need ≥1)"
|
|
135
|
+
ERRORS=$((ERRORS + 1))
|
|
136
|
+
fi
|
|
137
|
+
|
|
138
|
+
schema_ok=1
|
|
139
|
+
if ! python3 - "$SCHEMA" "$d/expected.json" "$fid" <<'PY'
|
|
140
|
+
import json, os, sys
|
|
141
|
+
schema_path, expected_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
|
|
142
|
+
schema = json.load(open(schema_path))
|
|
143
|
+
data = json.load(open(expected_path))
|
|
144
|
+
|
|
145
|
+
def is_string_list(value):
    """Return True when value is a list whose items are all non-empty strings.

    An empty list qualifies; any non-list value does not.
    """
    if not isinstance(value, list):
        return False
    for entry in value:
        if not isinstance(entry, str) or not entry:
            return False
    return True
|
|
147
|
+
|
|
148
|
+
def fallback_validate():
    """Validate expected.json without the jsonschema package.

    Reads the module-level ``schema`` (only its "properties" keys, used as the
    set of allowed top-level keys) and ``data`` (the parsed expected.json).

    Returns a list of human-readable violation messages; an empty list means
    no violation was found by these hand-written checks.
    """
    allowed = set(schema["properties"])
    errors = []
    if not isinstance(data, dict):
        return ["expected.json must be an object"]
    unknown = sorted(set(data) - allowed)
    if unknown:
        errors.append(f"expected.json has unknown key(s): {', '.join(unknown)}")
    commands = data.get("verification_commands", [])
    if not isinstance(commands, list):
        errors.append("verification_commands must be an array")
    else:
        for idx, command in enumerate(commands):
            if not isinstance(command, dict):
                errors.append(f"verification_commands[{idx}] must be an object")
                continue
            unknown_command = sorted(set(command) - {"cmd", "exit_code", "stdout_contains", "stdout_not_contains", "contract_refs"})
            if unknown_command:
                errors.append(f"verification_commands[{idx}] has unknown key(s): {', '.join(unknown_command)}")
            if not isinstance(command.get("cmd"), str) or not command.get("cmd"):
                errors.append(f"verification_commands[{idx}].cmd must be a non-empty string")
            exit_code = command.get("exit_code", 0)
            # bool is a subclass of int, so reject it explicitly.
            if isinstance(exit_code, bool) or not isinstance(exit_code, int):
                errors.append(f"verification_commands[{idx}].exit_code must be an integer")
            for key in ("stdout_contains", "stdout_not_contains", "contract_refs"):
                if key in command and not is_string_list(command[key]):
                    errors.append(f"verification_commands[{idx}].{key} must be an array of non-empty strings")
    patterns = data.get("forbidden_patterns", [])
    if not isinstance(patterns, list):
        errors.append("forbidden_patterns must be an array")
    else:
        for idx, pattern in enumerate(patterns):
            if not isinstance(pattern, dict):
                errors.append(f"forbidden_patterns[{idx}] must be an object")
                continue
            unknown_pattern = sorted(set(pattern) - {"pattern", "description", "files", "severity"})
            if unknown_pattern:
                errors.append(f"forbidden_patterns[{idx}] has unknown key(s): {', '.join(unknown_pattern)}")
            for key in ("pattern", "description"):
                if not isinstance(pattern.get(key), str) or not pattern.get(key):
                    errors.append(f"forbidden_patterns[{idx}].{key} must be a non-empty string")
            # Tuple membership compares with ==, so an unhashable JSON value
            # (array/object) is reported as a violation instead of raising
            # TypeError the way a set lookup would.
            if pattern.get("severity") not in ("disqualifier", "warning"):
                errors.append(f"forbidden_patterns[{idx}].severity must be disqualifier or warning")
            if "files" in pattern and not is_string_list(pattern["files"]):
                errors.append(f"forbidden_patterns[{idx}].files must be an array of non-empty strings")
    for key in ("required_files", "forbidden_files", "tier_a_waivers", "spec_output_files"):
        if key in data and not is_string_list(data[key]):
            errors.append(f"{key} must be an array of non-empty strings")
    max_deps_added = data.get("max_deps_added", 0)
    if isinstance(max_deps_added, bool) or not isinstance(max_deps_added, int) or max_deps_added < 0:
        errors.append("max_deps_added must be an integer >= 0")
    return errors
|
|
200
|
+
|
|
201
|
+
force_fallback = os.environ.get("DEVLYN_LINT_FIXTURES_NO_JSONSCHEMA") == "1"
|
|
202
|
+
try:
|
|
203
|
+
if force_fallback:
|
|
204
|
+
raise ImportError
|
|
205
|
+
import jsonschema
|
|
206
|
+
except ImportError:
|
|
207
|
+
fallback_errors = fallback_validate()
|
|
208
|
+
if fallback_errors:
|
|
209
|
+
for error in fallback_errors:
|
|
210
|
+
print(f"✗ {fid}: expected.json schema violation: {error}")
|
|
211
|
+
sys.exit(1)
|
|
212
|
+
else:
|
|
213
|
+
try:
|
|
214
|
+
jsonschema.validate(data, schema)
|
|
215
|
+
except jsonschema.ValidationError as e:
|
|
216
|
+
print(f"✗ {fid}: expected.json schema violation: {e.message}")
|
|
217
|
+
sys.exit(1)
|
|
218
|
+
PY
|
|
219
|
+
then
|
|
220
|
+
ERRORS=$((ERRORS + 1))
|
|
221
|
+
schema_ok=0
|
|
222
|
+
fi
|
|
223
|
+
|
|
224
|
+
if [ "$schema_ok" -eq 1 ]; then
|
|
225
|
+
if ! python3 "$SPEC_VERIFY_CHECK" --check "$d/spec.md"; then
|
|
226
|
+
echo "✗ $fid: spec-verify-check --check failed"
|
|
227
|
+
ERRORS=$((ERRORS + 1))
|
|
228
|
+
fi
|
|
229
|
+
if ! python3 "$SPEC_VERIFY_CHECK" --check-expected "$d/expected.json"; then
|
|
230
|
+
echo "✗ $fid: spec-verify-check --check-expected failed"
|
|
231
|
+
ERRORS=$((ERRORS + 1))
|
|
232
|
+
fi
|
|
233
|
+
|
|
234
|
+
python3 - "$d/spec.md" "$d/expected.json" "$fid" <<'PY' || ERRORS=$((ERRORS + 1))
|
|
235
|
+
import json, pathlib, re, sys
|
|
236
|
+
spec_path, expected_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
|
|
237
|
+
spec = open(spec_path, encoding="utf-8").read()
|
|
238
|
+
expected = json.load(open(expected_path, encoding="utf-8"))
|
|
239
|
+
fixture_dir = pathlib.Path(expected_path).parent
|
|
240
|
+
fixture_root = fixture_dir.resolve()
|
|
241
|
+
errors = []
|
|
242
|
+
for idx, command in enumerate(expected.get("verification_commands", [])):
|
|
243
|
+
cmd = str(command.get("cmd", ""))
|
|
244
|
+
if "BENCH_FIXTURE_DIR" not in cmd:
|
|
245
|
+
continue
|
|
246
|
+
fixture_refs = re.findall(r"(?:\$\{BENCH_FIXTURE_DIR\}|\$BENCH_FIXTURE_DIR)/([^\"'\s]+)", cmd)
|
|
247
|
+
if not fixture_refs:
|
|
248
|
+
errors.append(
|
|
249
|
+
f"verification_commands[{idx}] hidden oracle must reference an explicit $BENCH_FIXTURE_DIR/... file"
|
|
250
|
+
)
|
|
251
|
+
stdout_contains = command.get("stdout_contains", [])
|
|
252
|
+
if '"ok":true' not in stdout_contains:
|
|
253
|
+
errors.append(
|
|
254
|
+
f"verification_commands[{idx}] hidden oracle must assert stdout_contains includes '\"ok\":true'"
|
|
255
|
+
)
|
|
256
|
+
for fixture_ref in fixture_refs:
|
|
257
|
+
target = (fixture_dir / fixture_ref).resolve(strict=False)
|
|
258
|
+
try:
|
|
259
|
+
target.relative_to(fixture_root)
|
|
260
|
+
except ValueError:
|
|
261
|
+
errors.append(
|
|
262
|
+
f"verification_commands[{idx}] BENCH_FIXTURE_DIR file escapes fixture dir: {fixture_ref!r}"
|
|
263
|
+
)
|
|
264
|
+
continue
|
|
265
|
+
if not target.is_file():
|
|
266
|
+
errors.append(
|
|
267
|
+
f"verification_commands[{idx}] BENCH_FIXTURE_DIR file not found: {fixture_ref!r}"
|
|
268
|
+
)
|
|
269
|
+
refs = command.get("contract_refs", [])
|
|
270
|
+
if not refs:
|
|
271
|
+
errors.append(f"verification_commands[{idx}] hidden oracle missing contract_refs")
|
|
272
|
+
continue
|
|
273
|
+
for ref in refs:
|
|
274
|
+
if ref not in spec:
|
|
275
|
+
errors.append(
|
|
276
|
+
f"verification_commands[{idx}] contract_ref not found in spec.md: {ref!r}"
|
|
277
|
+
)
|
|
278
|
+
if errors:
|
|
279
|
+
for err in errors:
|
|
280
|
+
print(f"✗ {fid}: {err}")
|
|
281
|
+
sys.exit(1)
|
|
282
|
+
PY
|
|
283
|
+
fi
|
|
284
|
+
fi
|
|
285
|
+
|
|
286
|
+
if [ -f "$d/setup.sh" ] && [ ! -x "$d/setup.sh" ]; then
|
|
287
|
+
echo "✗ $fid: setup.sh not executable (run: chmod +x $d/setup.sh)"
|
|
288
|
+
ERRORS=$((ERRORS + 1))
|
|
289
|
+
fi
|
|
290
|
+
|
|
291
|
+
if [ -f "$d/NOTES.md" ] \
|
|
292
|
+
&& { { grep -Fq 'headroom gate' "$d/NOTES.md" && grep -Eq '`?FAIL`?' "$d/NOTES.md"; } \
|
|
293
|
+
|| { grep -Fq 'pair-lift evidence' "$d/NOTES.md" && grep -Eiq 'reject|rejected' "$d/NOTES.md"; }; } \
|
|
294
|
+
&& ! rejected_pair_fixture_reason "$fid" >/dev/null 2>&1; then
|
|
295
|
+
echo "✗ $fid: NOTES.md records pair-candidate rejection but pair-rejected-fixtures.sh has no rejected reason"
|
|
296
|
+
ERRORS=$((ERRORS + 1))
|
|
297
|
+
fi
|
|
298
|
+
|
|
299
|
+
if [ -f "$d/NOTES.md" ] \
|
|
300
|
+
&& grep -Fq 'pair_evidence_passed' "$d/NOTES.md" \
|
|
301
|
+
&& ! python3 "$SOLO_HEADROOM_CHECK" --expected-json "$d/expected.json" "$d/spec.md"; then
|
|
302
|
+
echo "✗ $fid: pair_evidence_passed fixture spec.md must document an actionable solo-headroom hypothesis with solo_claude miss and observable command from expected.json"
|
|
303
|
+
ERRORS=$((ERRORS + 1))
|
|
304
|
+
fi
|
|
305
|
+
done
|
|
306
|
+
|
|
307
|
+
for d in "$FIXTURES_DIR"/retired/$RETIRED_FIXTURE_GLOB/; do
|
|
308
|
+
[ -d "$d" ] || continue
|
|
309
|
+
RETIRED_COUNT=$((RETIRED_COUNT + 1))
|
|
310
|
+
fid="$(basename "$d")"
|
|
311
|
+
|
|
312
|
+
if [ ! -f "$d/RETIRED.md" ]; then
|
|
313
|
+
echo "✗ retired/$fid: missing RETIRED.md"
|
|
314
|
+
ERRORS=$((ERRORS + 1))
|
|
315
|
+
fi
|
|
316
|
+
|
|
317
|
+
for f in "${REQUIRED_FILES[@]}"; do
|
|
318
|
+
if [ ! -f "$d/$f" ]; then
|
|
319
|
+
echo "✗ retired/$fid: missing preserved $f"
|
|
320
|
+
ERRORS=$((ERRORS + 1))
|
|
321
|
+
fi
|
|
322
|
+
done
|
|
323
|
+
|
|
324
|
+
if [ -f "$d/metadata.json" ]; then
|
|
325
|
+
# Read the retired fixture's metadata.json "id". The path is handed to Python as
# argv[1] rather than spliced into the Python source, so paths containing quotes
# or backslashes cannot break (or inject code into) the snippet. Any failure
# (missing key, invalid JSON, unreadable file) yields an empty id.
meta_id=$(python3 -c 'import json,sys; print(json.load(open(sys.argv[1]))["id"])' "$d/metadata.json" 2>/dev/null || echo "")
|
|
326
|
+
if [ "$meta_id" != "$fid" ]; then
|
|
327
|
+
echo "✗ retired/$fid: metadata.json id='$meta_id' does not match dir name"
|
|
328
|
+
ERRORS=$((ERRORS + 1))
|
|
329
|
+
fi
|
|
330
|
+
fi
|
|
331
|
+
|
|
332
|
+
if [ -f "$d/setup.sh" ] && [ ! -x "$d/setup.sh" ]; then
|
|
333
|
+
echo "✗ retired/$fid: setup.sh not executable (run: chmod +x $d/setup.sh)"
|
|
334
|
+
ERRORS=$((ERRORS + 1))
|
|
335
|
+
fi
|
|
336
|
+
done
|
|
337
|
+
|
|
338
|
+
if [ $COUNT -eq 0 ]; then
|
|
339
|
+
echo "✗ no fixtures found in $FIXTURES_DIR"
|
|
340
|
+
exit 1
|
|
341
|
+
fi
|
|
342
|
+
|
|
343
|
+
if [ $ERRORS -gt 0 ]; then
|
|
344
|
+
echo ""
|
|
345
|
+
echo "✗ lint-fixtures: $ERRORS error(s) across $COUNT active fixture(s) and $RETIRED_COUNT retired fixture(s)"
|
|
346
|
+
exit 1
|
|
347
|
+
fi
|
|
348
|
+
|
|
349
|
+
echo "✓ lint-fixtures: $COUNT active fixture(s) passed schema + structural checks; $RETIRED_COUNT retired fixture(s) preserved"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# lint-shadow-fixtures.sh — run the standard fixture lint over shadow-fixtures/.
|
|
3
|
+
|
|
4
|
+
set -euo pipefail
|
|
5
|
+
|
|
6
|
+
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
|
7
|
+
SHADOW_FIXTURES_DIR="${DEVLYN_SHADOW_FIXTURES_DIR:-$REPO_ROOT/benchmark/auto-resolve/shadow-fixtures}"
|
|
8
|
+
|
|
9
|
+
DEVLYN_FIXTURES_DIR="$SHADOW_FIXTURES_DIR" \
|
|
10
|
+
DEVLYN_FIXTURE_GLOB="S*" \
|
|
11
|
+
DEVLYN_RETIRED_FIXTURE_GLOB="S*" \
|
|
12
|
+
bash "$REPO_ROOT/scripts/lint-fixtures.sh"
|
|
13
|
+
|
|
14
|
+
# Run the solo-headroom hypothesis checker under $REPO_ROOT, forwarding every
# argument unchanged. The exit status is whatever the checker script returns.
has_actionable_solo_headroom_hypothesis() {
  local checker="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py"
  python3 "$checker" "$@"
}
|
|
17
|
+
|
|
18
|
+
# Run the solo-ceiling-avoidance checker under $REPO_ROOT on a single NOTES.md
# path ($1). Only the first argument is forwarded; the exit status is whatever
# the checker script returns.
has_solo_ceiling_avoidance_note() {
  local notes_path="$1"
  local checker="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py"
  python3 "$checker" "$notes_path"
}
|
|
22
|
+
|
|
23
|
+
errors=0
|
|
24
|
+
for d in "$SHADOW_FIXTURES_DIR"/S*/; do
|
|
25
|
+
[ -d "$d" ] || continue
|
|
26
|
+
fid="$(basename "$d")"
|
|
27
|
+
meta="$d/metadata.json"
|
|
28
|
+
spec="$d/spec.md"
|
|
29
|
+
notes="$d/NOTES.md"
|
|
30
|
+
has_failed_headroom=0
|
|
31
|
+
if [ -f "$notes" ] && grep -Fq 'headroom' "$notes" && grep -Eq '`?FAIL`?' "$notes"; then
|
|
32
|
+
has_failed_headroom=1
|
|
33
|
+
fi
|
|
34
|
+
category="$(
|
|
35
|
+
python3 - "$meta" <<'PY'
|
|
36
|
+
import json
|
|
37
|
+
import sys
|
|
38
|
+
|
|
39
|
+
try:
|
|
40
|
+
with open(sys.argv[1], encoding="utf-8") as handle:
|
|
41
|
+
print(json.load(handle).get("category", ""))
|
|
42
|
+
except FileNotFoundError:
|
|
43
|
+
print("")
|
|
44
|
+
PY
|
|
45
|
+
)"
|
|
46
|
+
if [ "$category" = "high-risk" ] && [ "$has_failed_headroom" -eq 0 ]; then
|
|
47
|
+
if ! has_actionable_solo_headroom_hypothesis --expected-json "$d/expected.json" "$spec"; then
|
|
48
|
+
echo "✗ $fid: unmeasured high-risk shadow fixture spec.md must document a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend"
|
|
49
|
+
errors=$((errors + 1))
|
|
50
|
+
fi
|
|
51
|
+
if ! has_solo_ceiling_avoidance_note "$notes"; then
|
|
52
|
+
echo "✗ $fid: unmeasured high-risk shadow fixture NOTES.md must include ## Solo ceiling avoidance naming how it differs from solo-saturated controls before provider spend"
|
|
53
|
+
errors=$((errors + 1))
|
|
54
|
+
fi
|
|
55
|
+
fi
|
|
56
|
+
done
|
|
57
|
+
|
|
58
|
+
[ "$errors" -eq 0 ] || exit 1
|