devlyn-cli 1.15.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +104 -0
- package/CLAUDE.md +135 -21
- package/README.md +43 -125
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
- package/benchmark/auto-resolve/README.md +114 -0
- package/benchmark/auto-resolve/RUBRIC.md +162 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
- package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
- package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
- package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
- package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
- package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
- package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
- package/benchmark/auto-resolve/scripts/judge.sh +359 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
- package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
- package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
- package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
- package/bin/devlyn.js +175 -17
- package/config/skills/_shared/adapters/README.md +64 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
- package/config/skills/_shared/adapters/opus-4-7.md +29 -0
- package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
- package/config/skills/_shared/codex-config.md +54 -0
- package/config/skills/_shared/codex-monitored.sh +141 -0
- package/config/skills/_shared/engine-preflight.md +35 -0
- package/config/skills/_shared/expected.schema.json +93 -0
- package/config/skills/_shared/pair-plan-schema.md +298 -0
- package/config/skills/_shared/runtime-principles.md +110 -0
- package/config/skills/_shared/spec-verify-check.py +519 -0
- package/config/skills/devlyn:ideate/SKILL.md +99 -429
- package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
- package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
- package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
- package/config/skills/devlyn:resolve/SKILL.md +172 -184
- package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
- package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
- package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
- package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
- package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
- package/package.json +12 -2
- package/scripts/lint-skills.sh +431 -0
- package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
- package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
- package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
- package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
- package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
- package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
- package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
- package/config/skills/devlyn:clean/SKILL.md +0 -285
- package/config/skills/devlyn:design-ui/SKILL.md +0 -351
- package/config/skills/devlyn:discover-product/SKILL.md +0 -124
- package/config/skills/devlyn:evaluate/SKILL.md +0 -564
- package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
- package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
- package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
- package/config/skills/devlyn:preflight/SKILL.md +0 -355
- package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
- package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
- package/config/skills/devlyn:product-spec/SKILL.md +0 -603
- package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
- package/config/skills/devlyn:review/SKILL.md +0 -161
- package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
- package/config/skills/devlyn:team-review/SKILL.md +0 -493
- package/config/skills/devlyn:update-docs/SKILL.md +0 -463
- package/config/skills/workflow-routing/SKILL.md +0 -73
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# codex-monitored.sh — run `codex exec` in a monitored shape that keeps the
|
|
3
|
+
# outer claude -p API stream from going silent during long Codex calls.
|
|
4
|
+
#
|
|
5
|
+
# WHY (iter-0009, post iter-0006/0007/0008):
|
|
6
|
+
# • iter-0007 isolation proved a single foreground `codex exec` Bash dispatch
|
|
7
|
+
# can starve the outer API stream of bytes during a 10+ min run; Anthropic's
|
|
8
|
+
# byte-level idle watchdog fires (~300s) and kills the orchestrator.
|
|
9
|
+
# • iter-0008 saw the orchestrator pick `codex exec ... 2>&1 | tail -200` from
|
|
10
|
+
# its own pattern prior — `tail` on a pipe buffers until EOF, suppressing
|
|
11
|
+
# ALL bytes. Same starvation, amplified.
|
|
12
|
+
# • iter-0008 also documented that codex 0.124.0 reads stdin as a `<stdin>` block
|
|
13
|
+
# when the prompt is passed as an arg AND stdin is open; without
|
|
14
|
+
# `< /dev/null` the call hangs indefinitely.
|
|
15
|
+
#
|
|
16
|
+
# WHAT THIS WRAPPER DOES:
|
|
17
|
+
# 1. Refuses to run if stdout is a pipe. Piping wrapper output to text tools
|
|
18
|
+
# (tail/head/awk/sed/grep without --line-buffered) re-introduces the
|
|
19
|
+
# iter-0008 starvation mechanism — the downstream tool buffers until EOF
|
|
20
|
+
# and the outer claude -p byte-watchdog never sees bytes. Exits 64 with a
|
|
21
|
+
# clear message so the orchestrator can self-correct on retry.
|
|
22
|
+
# (Round 2 finding #1 fix: shim alone does not defeat `| tail`; the
|
|
23
|
+
# wrapper must reject the pipe shape directly.)
|
|
24
|
+
# 2. Redirects stdin from /dev/null (`< /dev/null`) so codex sees immediate EOF — kills the codex 0.124.0 stdin hang.
|
|
25
|
+
# 3. Streams codex stdout to OUR stdout line-by-line — the orchestrator reads
|
|
26
|
+
# stdout as the subagent reply (per `_shared/codex-config.md`) so we MUST
|
|
27
|
+
# NOT swallow it (e.g. `tail -n 200`). codex stderr forwards to OUR stderr.
|
|
28
|
+
# 4. Emits a `[codex-monitored] heartbeat` line every CODEX_MONITORED_HEARTBEAT
|
|
29
|
+
# seconds (default 30s) on STDERR while codex is alive. Heartbeat-on-stderr
|
|
30
|
+
# keeps the orchestrator's combined-output stream non-silent without
|
|
31
|
+
# polluting the codex-reply view of stdout.
|
|
32
|
+
# 5. Forwards SIGTERM/SIGINT from the outer watchdog to the codex child so a
|
|
33
|
+
# timeout actually reaps codex (otherwise process group kill races with
|
|
34
|
+
# backgrounded codex).
|
|
35
|
+
# 6. Preserves codex's exact exit code.
|
|
36
|
+
#
|
|
37
|
+
# USAGE:
|
|
38
|
+
# bash codex-monitored.sh -C <repo> -s read-only -c model_reasoning_effort=xhigh "<prompt>"
|
|
39
|
+
# bash codex-monitored.sh resume --last
|
|
40
|
+
# (Args after the script name are passed verbatim to `codex exec`.)
|
|
41
|
+
#
|
|
42
|
+
# ENV OVERRIDES:
|
|
43
|
+
# CODEX_MONITORED_HEARTBEAT — heartbeat interval seconds (default 30).
|
|
44
|
+
# CODEX_BIN — real codex binary path. Default: `codex`.
|
|
45
|
+
# Set this when the shim has put us first
|
|
46
|
+
# on PATH.
|
|
47
|
+
# CODEX_MONITORED_ALLOW_PIPED — set non-empty to skip the pipe-stdout
|
|
48
|
+
# refusal. Reserved for tests; don't use
|
|
49
|
+
# in skill prompts.
|
|
50
|
+
|
|
51
|
+
set -uo pipefail
|
|
52
|
+
|
|
53
|
+
# iter-0019 — solo_claude (L1) arm enforcement (defense in depth alongside
|
|
54
|
+
# scripts/codex-shim/codex). If this env is set, the wrapper refuses to invoke
|
|
55
|
+
# codex at all, regardless of how it was reached. Two enforcement points
|
|
56
|
+
# protect against the case where one is bypassed: the shim catches PATH-based
|
|
57
|
+
# resolution, and this wrapper catches direct-path invocations of
|
|
58
|
+
# codex-monitored.sh that don't go through the shim.
|
|
59
|
+
if [ -n "${CODEX_BLOCKED:-}" ]; then
|
|
60
|
+
printf '[codex-monitored] CODEX_BLOCKED=%s — refusing codex invocation (solo_claude / L1 arm enforcement). args: %s\n' \
|
|
61
|
+
"${CODEX_BLOCKED}" "$*" >&2
|
|
62
|
+
exit 126
|
|
63
|
+
fi
|
|
64
|
+
|
|
65
|
+
HEARTBEAT_SEC="${CODEX_MONITORED_HEARTBEAT:-30}"
|
|
66
|
+
CODEX_BIN="${CODEX_BIN:-codex}"
|
|
67
|
+
START=$(date +%s)
|
|
68
|
+
|
|
69
|
+
# --- Pipe-stdout refusal (iter-0009 R2 finding #1) -------------------------
|
|
70
|
+
# `[ -p /dev/stdout ]` is the POSIX test for "is fd 1 a FIFO/pipe". Verified
|
|
71
|
+
# correct on macOS via lsof: distinguishes piped (`| cat`) from redirected
|
|
72
|
+
# (`> file`) and from claude-bash-tool capture (regular file). Without this
|
|
73
|
+
# refusal, `bash WRAPPER ... 2>&1 | tail -200` would buffer wrapper output —
|
|
74
|
+
# including the heartbeat on stderr after `2>&1` — until EOF, reproducing
|
|
75
|
+
# the iter-0008 byte-watchdog kill.
|
|
76
|
+
if [ -z "${CODEX_MONITORED_ALLOW_PIPED:-}" ] && [ -p /dev/stdout ]; then
|
|
77
|
+
cat >&2 <<'EOF'
|
|
78
|
+
[codex-monitored] error: stdout is a pipe.
|
|
79
|
+
|
|
80
|
+
Piping the wrapper to tail/head/awk/sed/grep buffers wrapper output until EOF,
|
|
81
|
+
which starves the outer claude -p byte-watchdog (iter-0008 starvation mechanism)
|
|
82
|
+
and kills the run after ~300s with empty transcript.
|
|
83
|
+
|
|
84
|
+
Fix: invoke the wrapper directly so the bash tool captures its stdout. The
|
|
85
|
+
wrapper streams full Codex output and emits a heartbeat on stderr; you do NOT
|
|
86
|
+
need to truncate.
|
|
87
|
+
|
|
88
|
+
WRONG: bash codex-monitored.sh ... 2>&1 | tail -200
|
|
89
|
+
RIGHT: bash codex-monitored.sh ...
|
|
90
|
+
|
|
91
|
+
If you absolutely must filter, use a line-buffered tool (e.g. `grep --line-buffered`)
|
|
92
|
+
and set CODEX_MONITORED_ALLOW_PIPED=1 in the wrapper's environment.
|
|
93
|
+
EOF
|
|
94
|
+
exit 64
|
|
95
|
+
fi
|
|
96
|
+
|
|
97
|
+
# --- Heartbeat + signal forwarding ----------------------------------------
|
|
98
|
+
heartbeat_loop() {
|
|
99
|
+
local pid="$1"
|
|
100
|
+
while kill -0 "$pid" 2>/dev/null; do
|
|
101
|
+
sleep "$HEARTBEAT_SEC"
|
|
102
|
+
if kill -0 "$pid" 2>/dev/null; then
|
|
103
|
+
local elapsed=$(( $(date +%s) - START ))
|
|
104
|
+
printf '[codex-monitored] heartbeat: elapsed=%ds\n' "$elapsed" >&2
|
|
105
|
+
fi
|
|
106
|
+
done
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
forward_signal() {
|
|
110
|
+
local sig="$1"
|
|
111
|
+
if [ -n "${CODEX_PID:-}" ] && kill -0 "$CODEX_PID" 2>/dev/null; then
|
|
112
|
+
kill -"$sig" "$CODEX_PID" 2>/dev/null || true
|
|
113
|
+
fi
|
|
114
|
+
if [ -n "${HB_PID:-}" ] && kill -0 "$HB_PID" 2>/dev/null; then
|
|
115
|
+
kill -TERM "$HB_PID" 2>/dev/null || true
|
|
116
|
+
fi
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
trap 'forward_signal TERM' TERM
|
|
120
|
+
trap 'forward_signal INT' INT
|
|
121
|
+
|
|
122
|
+
printf '[codex-monitored] start: ts=%s heartbeat=%ds bin=%s\n' \
|
|
123
|
+
"$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$CODEX_BIN" >&2
|
|
124
|
+
|
|
125
|
+
# Launch codex with stdin redirected from /dev/null; output streams directly to OUR stdout/stderr.
|
|
126
|
+
"$CODEX_BIN" exec "$@" < /dev/null &
|
|
127
|
+
CODEX_PID=$!
|
|
128
|
+
printf '[codex-monitored] codex pid=%d\n' "$CODEX_PID" >&2
|
|
129
|
+
|
|
130
|
+
heartbeat_loop "$CODEX_PID" &
|
|
131
|
+
HB_PID=$!
|
|
132
|
+
|
|
133
|
+
wait "$CODEX_PID"
|
|
134
|
+
EXIT=$?
|
|
135
|
+
|
|
136
|
+
kill -TERM "$HB_PID" 2>/dev/null || true
|
|
137
|
+
wait "$HB_PID" 2>/dev/null || true
|
|
138
|
+
|
|
139
|
+
printf '[codex-monitored] codex exited: code=%d elapsed=%ds\n' \
|
|
140
|
+
"$EXIT" $(( $(date +%s) - START )) >&2
|
|
141
|
+
exit "$EXIT"
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Shared — `--engine` Pre-flight
|
|
2
|
+
|
|
3
|
+
Used by `/devlyn:resolve` and `/devlyn:ideate`. One shared availability rule so every skill routes identically.
|
|
4
|
+
|
|
5
|
+
## Rule
|
|
6
|
+
|
|
7
|
+
Each skill resolves the effective engine from its own SKILL.md default plus any explicit `--engine` flag passed by the user. This pre-flight runs **only when the resolved engine is `auto` or `codex`** — when the resolved engine is `claude` (whether by skill default or explicit flag), the Codex check is skipped entirely.
|
|
8
|
+
|
|
9
|
+
When the resolved engine is `auto` or `codex`, on entry (before spawning any phase that could route to Codex):
|
|
10
|
+
|
|
11
|
+
1. Check if the Codex CLI is installed: `command -v codex >/dev/null 2>&1` (or equivalent bash test).
|
|
12
|
+
2. On failure → silently set `engine = "claude"` for the remainder of this run AND log `engine downgraded: codex-unavailable` into the skill's final summary/report header.
|
|
13
|
+
3. On success → proceed with the original engine value.
|
|
14
|
+
|
|
15
|
+
Never prompt the user. Never abort the run on missing CLI.
|
|
16
|
+
|
|
17
|
+
Per-skill defaults: `/devlyn:resolve` defaults to `claude` (post iter-0020 close-out — Codex BUILD/IMPLEMENT below quality floor; iter-0033g + iter-0034 close-out — PLAN-pair research-only until container/sandbox infra justifies a measurement); `/devlyn:ideate` defaults to `auto` for the CHALLENGE phase's cross-model GAN-critic dynamic. Each skill's SKILL.md flag block is the source of truth for that skill's default.
|
|
18
|
+
|
|
19
|
+
## Why this is the one permitted silent fallback
|
|
20
|
+
|
|
21
|
+
`CLAUDE.md` sets the no-silent-fallback rule for this repo. This downgrade is documented there as the single explicit exception because the hands-free contract — skills the user walks away from — would otherwise fail every run whenever the Codex CLI is absent. The user-visible behavior is identical to an explicit `--engine claude` invocation, and the banner in the final report removes the silence. Any other silent fallback in skills code is a bug.
|
|
22
|
+
|
|
23
|
+
## What a skill must log after downgrade
|
|
24
|
+
|
|
25
|
+
When the resolved engine was `auto` / `codex` and the Codex CLI was absent, the final user-facing report/summary shows both the requested and effective mode:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
Engine: claude (downgraded from auto — codex-unavailable)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
If no downgrade happened (either Codex was available, or the resolved engine was already `claude`), omit the parenthetical. That single line is the contract — the user can always see why Codex did or did not participate.
|
|
32
|
+
|
|
33
|
+
## Canonical Codex invocation
|
|
34
|
+
|
|
35
|
+
See `config/skills/_shared/codex-config.md` for the canonical wrapper invocation and flag set skills should use after the availability check passes.
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
|
+
"$id": "https://github.com/fysoul17/devlyn-cli/config/skills/_shared/expected.schema.json",
|
|
4
|
+
"title": "spec.expected.json — mechanical acceptance contract",
|
|
5
|
+
"description": "Load-bearing LLM-agnostic decoupler for the devlyn-cli harness. Defines the machine-readable acceptance criteria a spec ships alongside spec.md. Stable across model upgrades — when Opus 5 / GPT-6 / Qwen / Gemini land, this schema does not move; only the per-model adapter files in _shared/adapters/ do.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"additionalProperties": false,
|
|
8
|
+
"properties": {
|
|
9
|
+
"verification_commands": {
|
|
10
|
+
"type": "array",
|
|
11
|
+
"description": "Each command is executed against the post-BUILD code. Each pass/fail contributes to verify_score. At least one entry is required when the spec has any observable runtime check (CLI, test command, HTTP request).",
|
|
12
|
+
"items": {
|
|
13
|
+
"type": "object",
|
|
14
|
+
"additionalProperties": false,
|
|
15
|
+
"required": ["cmd"],
|
|
16
|
+
"properties": {
|
|
17
|
+
"cmd": {
|
|
18
|
+
"type": "string",
|
|
19
|
+
"description": "Shell command, executed via `subprocess.run(..., shell=True)` from the build's working directory.",
|
|
20
|
+
"minLength": 1
|
|
21
|
+
},
|
|
22
|
+
"exit_code": {
|
|
23
|
+
"type": "integer",
|
|
24
|
+
"description": "Required exit code. Default 0 if omitted.",
|
|
25
|
+
"default": 0
|
|
26
|
+
},
|
|
27
|
+
"stdout_contains": {
|
|
28
|
+
"type": "array",
|
|
29
|
+
"description": "Each substring must appear verbatim in (stdout + stderr) for pass.",
|
|
30
|
+
"items": { "type": "string", "minLength": 1 },
|
|
31
|
+
"default": []
|
|
32
|
+
},
|
|
33
|
+
"stdout_not_contains": {
|
|
34
|
+
"type": "array",
|
|
35
|
+
"description": "None of these substrings may appear in (stdout + stderr) for pass.",
|
|
36
|
+
"items": { "type": "string", "minLength": 1 },
|
|
37
|
+
"default": []
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"forbidden_patterns": {
|
|
43
|
+
"type": "array",
|
|
44
|
+
"description": "Regex patterns scanned across diff.patch. Match at severity=disqualifier is a hard-floor fail; match at severity=warning is judge-only critical-finding.",
|
|
45
|
+
"items": {
|
|
46
|
+
"type": "object",
|
|
47
|
+
"additionalProperties": false,
|
|
48
|
+
"required": ["pattern", "description", "severity"],
|
|
49
|
+
"properties": {
|
|
50
|
+
"pattern": {
|
|
51
|
+
"type": "string",
|
|
52
|
+
"description": "Python re.search-compatible regex. re.search matches anywhere in the scanned text, so the pattern is unanchored unless it explicitly uses ^ / $ where anchoring is intended.",
|
|
53
|
+
"minLength": 1
|
|
54
|
+
},
|
|
55
|
+
"description": {
|
|
56
|
+
"type": "string",
|
|
57
|
+
"description": "Human-readable explanation of what the pattern catches and why it's forbidden.",
|
|
58
|
+
"minLength": 1
|
|
59
|
+
},
|
|
60
|
+
"files": {
|
|
61
|
+
"type": "array",
|
|
62
|
+
"description": "Optional allow-list of files (substrings of diff --git lines). When present, the scan is restricted to the hunks that belong to these files.",
|
|
63
|
+
"items": { "type": "string", "minLength": 1 },
|
|
64
|
+
"default": []
|
|
65
|
+
},
|
|
66
|
+
"severity": {
|
|
67
|
+
"type": "string",
|
|
68
|
+
"enum": ["disqualifier", "warning"],
|
|
69
|
+
"description": "disqualifier = hard-floor fail (DQ). warning = judge-visible critical-finding only."
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
},
|
|
74
|
+
"required_files": {
|
|
75
|
+
"type": "array",
|
|
76
|
+
"description": "Files that must exist after the arm runs.",
|
|
77
|
+
"items": { "type": "string", "minLength": 1 },
|
|
78
|
+
"default": []
|
|
79
|
+
},
|
|
80
|
+
"forbidden_files": {
|
|
81
|
+
"type": "array",
|
|
82
|
+
"description": "Files that must NOT appear in the arm's diff (e.g. tooling artifacts the spec didn't request).",
|
|
83
|
+
"items": { "type": "string", "minLength": 1 },
|
|
84
|
+
"default": []
|
|
85
|
+
},
|
|
86
|
+
"max_deps_added": {
|
|
87
|
+
"type": "integer",
|
|
88
|
+
"description": "Hard cap on new entries under dependencies/devDependencies in package.json. Exceeds → DQ.",
|
|
89
|
+
"minimum": 0,
|
|
90
|
+
"default": 0
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
# Shared — `pair-plan.json` schema (iter-0022 archive)
|
|
2
|
+
|
|
3
|
+
> **Archive header (iter-0034 Phase 4 cutover, 2026-05-04)** — this schema was iter-0022 infrastructure for the now-deleted `/devlyn:auto-resolve` PHASE 0 plan-pair contract. The `/devlyn:resolve` PHASE 1 PLAN at HEAD runs solo (per iter-0033 (C1) PASS evidence + iter-0033g § "CLOSURE"). The schema is preserved here as a design archive. The unblock conditions for re-instating PLAN-pair (per `iterations/0034-phase-4-cutover.md` § "L2 PLAN-pair research-only label") are: (A) container/sandbox isolation justified by another product need, OR (B) production telemetry captures positive evidence of subagent introspection that a PLAN-pair measurement would need to isolate. When either condition fires, this schema (and the associated lint / idgen tooling under `benchmark/auto-resolve/scripts/`, plus the preflight script at `autoresearch/scripts/pair-plan-preflight.sh`) is the starting point.
|
|
4
|
+
|
|
5
|
+
Single source of truth for `pair-plan.json` and its companion `canonical_id_registry.json` when the architecture re-enters scope. Read this once before editing `pair-plan-idgen.py`, `pair-plan-lint.py`, `pair-plan-preflight.sh`, or any future plan-pair PHASE that consumes `state.plan`.
|
|
6
|
+
|
|
7
|
+
## Audience (when re-instated)
|
|
8
|
+
|
|
9
|
+
- `benchmark/auto-resolve/scripts/pair-plan-idgen.py` — produces `canonical_id_registry.json` from `expected.json` + checked-in oracle scripts.
|
|
10
|
+
- `benchmark/auto-resolve/scripts/pair-plan-lint.py` — validates a `pair-plan.json` against its registry.
|
|
11
|
+
- `autoresearch/scripts/pair-plan-preflight.sh` — orchestrates solo + pair plan generation against blind-aliased fixtures.
|
|
12
|
+
- A future `/devlyn:resolve` PHASE 1 plan-pair branch (currently solo; gated on unblock A or B above): would accept `--plan-path` / JSON payload, set `state.plan.{mode, path}`, and run lint before IMPLEMENT, mirroring the deleted `devlyn:auto-resolve` PHASE 0 contract.
|
|
13
|
+
|
|
14
|
+
## File locations and naming (canonical)
|
|
15
|
+
|
|
16
|
+
- Registry per fixture: `benchmark/auto-resolve/fixtures/<F>/expected-pair-plan-registry.json` (committed snapshot for diff-against-baseline; iter-0023 verifies the live idgen output equals this snapshot).
|
|
17
|
+
- Plan produced by preflight: `benchmark/auto-resolve/results/<run_id>/<blind_fixture>/plan-preflight/merged/pair-plan.json`.
|
|
18
|
+
- Plan supplied to a re-instated plan-pair branch by an external caller: any path the user chooses, passed via `--plan-path <path>` (the contract surface is preserved as iter-0022 design archive).
|
|
19
|
+
- The registry filename is `canonical_id_registry.json` for **runtime artifacts** — both inside the bundle dir and in the preflight output root. (HANDOFF.md:280 mentions `canonical-ids.json` for the preflight output dir; that name is deprecated — D4 emits `canonical_id_registry.json` to align with the rest of the toolchain.)
|
|
20
|
+
- The **committed fixture snapshot** is named `expected-pair-plan-registry.json` (one per fixture, under `benchmark/auto-resolve/fixtures/<F>/`) — distinct file name to make snapshots greppable separately from runtime artifacts. iter-0023 verifies the live idgen output equals the committed snapshot for the same fixture.
|
|
21
|
+
|
|
22
|
+
## `canonical_id_registry.json` shape
|
|
23
|
+
|
|
24
|
+
Top-level wrapper:
|
|
25
|
+
|
|
26
|
+
```jsonc
|
|
27
|
+
{
|
|
28
|
+
"schema_version": "1",
|
|
29
|
+
"fixture_id": "F2-cli-medium-subcommand",
|
|
30
|
+
"generated_at": "2026-04-29T18:30:00Z",
|
|
31
|
+
"generated_from": {
|
|
32
|
+
"expected_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json",
|
|
33
|
+
"expected_sha256": "...", // raw file bytes sha256
|
|
34
|
+
"metadata_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json",
|
|
35
|
+
"metadata_sha256": "...", // raw file bytes sha256
|
|
36
|
+
"oracle_script_shas": {
|
|
37
|
+
"test-fidelity": "...", // raw bytes sha256 of oracle-test-fidelity.py
|
|
38
|
+
"scope-tier-a": "...",
|
|
39
|
+
"scope-tier-b": "..."
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"required_invariants": [
|
|
43
|
+
{
|
|
44
|
+
"id": "...",
|
|
45
|
+
"source_field": "expected.json/forbidden_patterns/0 | expected.json/verification_commands/3 | expected.json/required_files | expected.json/forbidden_files | expected.json/max_deps_added | expected.json/spec_output_files | oracle/<oracle-name>/<category-id>",
|
|
46
|
+
"source_ref": "expected.json:60 | expected.json/verification_commands/0 | oracle-test-fidelity.py",
|
|
47
|
+
"operational_check": "...natural-language description of what the variant must do or must not do...",
|
|
48
|
+
"severity": "disqualifier | hard | flag | warn",
|
|
49
|
+
"authority": "expected.json/forbidden_patterns | expected.json/verification_commands | expected.json/required_files | expected.json/forbidden_files | expected.json/max_deps_added | expected.json/spec_output_files | metadata/oracle-allowlist"
|
|
50
|
+
}
|
|
51
|
+
// ...sorted lexicographically by id
|
|
52
|
+
]
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
**Hard rules**:
|
|
57
|
+
- `required_invariants` MUST be sorted lexicographically by `id`. idgen sorts before serializing; lint rejects an unsorted file.
|
|
58
|
+
- All file shas (`expected_sha256`, `metadata_sha256`, `oracle_script_shas.*`) are **raw file bytes sha256** — `sha256(open(path, "rb").read())`. NOT canonical-JSON form. (Canonical form is reserved for the pair-plan pre-stamp hash; see below.)
|
|
59
|
+
- `info`-severity oracle categories are NOT registry entries (e.g. scope-tier-b's `tier-b-reachable` is a positive signal, not an invariant violation).
|
|
60
|
+
- The umbrella oracle category `scope-tier-a:tier-a-violation` is ONE registry entry; the 5 path-glob groups (planning-doc, ci-config, node-modules, test-results-or-coverage, env-secrets) are described inside `operational_check`, not split into 5 entries.
|
|
61
|
+
|
|
62
|
+
**Determinism**: same `(expected.json, metadata.json, oracle scripts)` input → byte-identical `canonical_id_registry.json`. Achieved by:
|
|
63
|
+
- `json.dumps(obj, sort_keys=True, indent=2, ensure_ascii=False)` for the on-disk file.
|
|
64
|
+
- All lists pre-sorted before dumping (registry items by `id`).
|
|
65
|
+
- No timestamps that change run-to-run except `generated_at` — see exemption below.
|
|
66
|
+
|
|
67
|
+
`generated_at` is the ONE volatile field. Lint ignores it for sha-stability checks; lint's determinism check sets `generated_at` to a fixed value before comparing two consecutive idgen runs. (Implementation: idgen accepts `--generated-at <iso8601>` for testing.)
|
|
68
|
+
|
|
69
|
+
## `pair-plan.json` shape
|
|
70
|
+
|
|
71
|
+
```jsonc
|
|
72
|
+
{
|
|
73
|
+
"schema_version": "1",
|
|
74
|
+
"plan_status": "final | blocked | draft",
|
|
75
|
+
"planning_mode": "solo | pair",
|
|
76
|
+
"fixture_id": "F2-cli-medium-subcommand", // human label; not authoritative
|
|
77
|
+
"source": {
|
|
78
|
+
"spec_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md",
|
|
79
|
+
"spec_sha256": "...", // raw file bytes
|
|
80
|
+
"expected_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json",
|
|
81
|
+
"expected_sha256": "...", // raw file bytes (optional only when expected.json absent)
|
|
82
|
+
"rubric_path": "benchmark/auto-resolve/RUBRIC.md",
|
|
83
|
+
"rubric_sha256": "...", // raw file bytes
|
|
84
|
+
"canonical_id_registry_path": "benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json",
|
|
85
|
+
"canonical_id_registry_sha256": "..." // raw file bytes of the registry file
|
|
86
|
+
},
|
|
87
|
+
"authority_order": [
|
|
88
|
+
"spec.md",
|
|
89
|
+
"expected.json/rubric",
|
|
90
|
+
"phase prompt",
|
|
91
|
+
"model preference"
|
|
92
|
+
],
|
|
93
|
+
"rounds": [
|
|
94
|
+
{
|
|
95
|
+
"round": 1,
|
|
96
|
+
"claude_draft_sha256": "...", // raw file bytes of the per-round draft artifact
|
|
97
|
+
"codex_draft_sha256": "...",
|
|
98
|
+
"merged_sha256": "...",
|
|
99
|
+
"note": "..."
|
|
100
|
+
}
|
|
101
|
+
// up to 3 rounds; iter-0022 preflight stops at the first round where neither model has new substantive critique
|
|
102
|
+
],
|
|
103
|
+
"accepted_invariants": [
|
|
104
|
+
{
|
|
105
|
+
"id": "no_silent_catch_return_fallback",
|
|
106
|
+
"paraphrase": "...", // human-readable; informational only, NOT enforced
|
|
107
|
+
"source_refs": ["spec.md:36", "expected.json/forbidden_patterns/0"],
|
|
108
|
+
"operational_check": "BUILD output must not contain `catch[^{]*\\{[^}]*return [^}]*\\}` in bin/cli.js",
|
|
109
|
+
"authority": "expected.json/forbidden_patterns"
|
|
110
|
+
}
|
|
111
|
+
],
|
|
112
|
+
"rejected_alternatives": [
|
|
113
|
+
{
|
|
114
|
+
"id": "alt_silent_catch_with_log",
|
|
115
|
+
"rationale": "Authority order says expected.json/forbidden_patterns dominates; logging does not change visible-error contract.",
|
|
116
|
+
"conflicts_with_ids": ["no_silent_catch_return_fallback"],
|
|
117
|
+
"claude_stamp": "rejected",
|
|
118
|
+
"codex_stamp": "rejected"
|
|
119
|
+
}
|
|
120
|
+
],
|
|
121
|
+
"unresolved": [], // MUST be empty in final plans
|
|
122
|
+
"escalated_to_user": [], // populated only during draft / blocked status; final must have user_resolution per item if non-empty
|
|
123
|
+
"model_stamps": {
|
|
124
|
+
"claude": {
|
|
125
|
+
"status": "sign | block",
|
|
126
|
+
"blocked_ids": [],
|
|
127
|
+
"signed_plan_sha256": "...", // canonical pre-stamp sha (see below)
|
|
128
|
+
"model": "claude-opus-4-7",
|
|
129
|
+
"timestamp": "2026-04-29T..."
|
|
130
|
+
},
|
|
131
|
+
"codex": {
|
|
132
|
+
"status": "sign | block",
|
|
133
|
+
"blocked_ids": [],
|
|
134
|
+
"signed_plan_sha256": "...",
|
|
135
|
+
"model": "gpt-5.5",
|
|
136
|
+
"timestamp": "..."
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Severity decoupling (registry vs findings)
|
|
143
|
+
|
|
144
|
+
The registry's `required_invariants[].severity` taxonomy is **metadata for human review only**: `disqualifier | hard | flag | warn`. It is NOT mapped onto the `references/findings-schema.md` taxonomy used by EVAL / CRITIC findings (`CRITICAL | HIGH | MEDIUM | LOW`). When a phase emits a finding for a missed plan invariant, severity is assigned by that phase's own existing severity policy (per `findings-schema.md`), not by reading the registry severity directly. The two taxonomies serve different audiences (registry severity = "how the oracle classifies it"; findings severity = "what the orchestrator should do about it") and are intentionally not coupled in iter-0022.
|
|
145
|
+
|
|
146
|
+
## Hard rules (lint-enforced)
|
|
147
|
+
|
|
148
|
+
1. `unresolved.length > 0` → `plan_status` MUST be `blocked` or `draft`. Final accepted plan MUST have `unresolved == []`.
|
|
149
|
+
2. `escalated_to_user[]` non-empty → each item MUST carry a `user_resolution` field, OR `plan_status` MUST be `blocked` / `draft`.
|
|
150
|
+
3. Every `accepted_invariants[].id` MUST appear in the registry's `required_invariants[].id` exactly (string match — no paraphrase, no synonym, no new IDs at plan-time). `paraphrase` is informational only.
|
|
151
|
+
4. **Final-plan coverage**: when `plan_status == "final"`, every registry entry MUST be accounted for in the plan — each `required_invariants[].id` is in `accepted_invariants[].id` OR in some `rejected_alternatives[].conflicts_with_ids[]` OR in `escalated_to_user[].id` OR in `unresolved[].id`. Because Rule #1 requires `unresolved == []` for a final plan, the `unresolved[].id` branch can never actually satisfy coverage in a final plan; in practice every id must land in one of the first three places. (`draft` and `blocked` plans are NOT subject to full coverage; they may still carry un-decided ids in `unresolved[]` per Rule #1.)
|
|
152
|
+
5. `authority_order` MUST be the exact 4-string array `["spec.md", "expected.json/rubric", "phase prompt", "model preference"]` (snapshot at iter-0022 ship time; future iters can amend with explicit `schema_version` bump).
|
|
153
|
+
6. `model_stamps.{claude,codex}.status == "sign"` MUST hold for `plan_status: "final"`. A `block` from either model forces `plan_status` to `blocked` or `draft`.
|
|
154
|
+
7. `model_stamps.{claude,codex}.signed_plan_sha256` MUST be byte-identical AND MUST equal the canonical pre-stamp sha256 of the file (see "Two sha256 contracts" below).
|
|
155
|
+
8. `source.{spec_sha256, expected_sha256, rubric_sha256, canonical_id_registry_sha256}` MUST equal the actual raw-bytes sha256 of the referenced files at lint time (catches stale plans against changed sources).
|
|
156
|
+
9. `source.canonical_id_registry_path` MUST resolve to an existing registry file. lint reads it from this field; if `--registry <path>` is passed on the lint command line, the override wins.
|
|
157
|
+
10. `planning_mode: "pair"` requires `rounds.length >= 1`. `planning_mode: "solo"` requires `rounds.length == 0` (no merge artifacts).
|
|
158
|
+
|
|
159
|
+
## Two sha256 contracts (DO NOT CONFLATE)
|
|
160
|
+
|
|
161
|
+
### Contract A — raw file bytes
|
|
162
|
+
|
|
163
|
+
Used for: every `source.*_sha256` field (spec, expected, rubric, registry), every `generated_from.*_sha256` field in the registry (including each `generated_from.oracle_script_shas.*` value), and every `rounds[].*_draft_sha256` and `rounds[].merged_sha256`.
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
import hashlib
|
|
167
|
+
with open(path, "rb") as f:
|
|
168
|
+
sha = hashlib.sha256(f.read()).hexdigest()
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
No canonicalization. The bytes on disk are what gets hashed. This catches "the plan claims spec.md is sha X but spec.md actually has bytes producing sha Y" drift.
|
|
172
|
+
|
|
173
|
+
### Contract B — canonical pre-stamp form (pair-plan stamps only)
|
|
174
|
+
|
|
175
|
+
Used for: `model_stamps.claude.signed_plan_sha256` and `model_stamps.codex.signed_plan_sha256`. Both stamps sign **byte-identical** canonical bytes, so both sha values are byte-identical.
|
|
176
|
+
|
|
177
|
+
Algorithm (writers and verifiers MUST implement exactly):
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
import json
|
|
181
|
+
import hashlib
|
|
182
|
+
import copy
|
|
183
|
+
|
|
184
|
+
def canonical_pre_stamp_sha256(plan: dict) -> str:
|
|
185
|
+
# Reject duplicate keys when LOADING the plan; this function assumes a clean dict.
|
|
186
|
+
pre = copy.deepcopy(plan)
|
|
187
|
+
pre["model_stamps"] = {} # replace value, keep key
|
|
188
|
+
s = json.dumps(
|
|
189
|
+
pre,
|
|
190
|
+
sort_keys=True,
|
|
191
|
+
separators=(",", ":"),
|
|
192
|
+
ensure_ascii=False,
|
|
193
|
+
allow_nan=False,
|
|
194
|
+
)
|
|
195
|
+
return hashlib.sha256(s.encode("utf-8")).hexdigest()
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
When LOADING the plan, reject duplicate keys:
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
def _strict_pairs(pairs):
|
|
202
|
+
keys = [k for k, _ in pairs]
|
|
203
|
+
if len(keys) != len(set(keys)):
|
|
204
|
+
raise ValueError("duplicate key in pair-plan.json")
|
|
205
|
+
return dict(pairs)
|
|
206
|
+
|
|
207
|
+
with open(path, "r", encoding="utf-8") as f:
|
|
208
|
+
plan = json.load(f, object_pairs_hook=_strict_pairs)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
**Why no Unicode normalization**: the canonical form hashes input bytes as-is. Writers and verifiers must agree on input form (NFC recommended for any user-supplied free-text strings, but not enforced — the scheme survives because both sides derive from the same source bytes).
|
|
212
|
+
|
|
213
|
+
**Why no floats**: integers and strings serialize byte-stably across implementations. Floats do not (e.g. `1.0` vs `1`). Avoid floats in this schema until a future field absolutely requires one; if one is added, document the canonical float-printing rule in this file.
|
|
214
|
+
|
|
215
|
+
## Slug rules for registry IDs (idgen)
|
|
216
|
+
|
|
217
|
+
When an `expected.json` item lacks an explicit `id` field, idgen synthesizes a deterministic slug.
|
|
218
|
+
|
|
219
|
+
### `forbidden_patterns[i]` slug
|
|
220
|
+
|
|
221
|
+
```
|
|
222
|
+
forbidden_pattern__<sanitize(description, 60)>__<sanitize(files[0], 30)>
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
`sanitize(s, max_len)`: lowercase; replace any non-`[a-z0-9]` run with a single `_`; strip leading/trailing `_`; truncate to `max_len` (right-truncate, no hash suffix at this level).
|
|
226
|
+
|
|
227
|
+
If two items in the same `forbidden_patterns[]` array produce the same slug after sanitization, the FIRST one (by source-array index) keeps the bare slug; each subsequent collision appends `__i<index>`. idgen detects this deterministically by walking the array in order.
|
|
228
|
+
|
|
229
|
+
Example F2:
|
|
230
|
+
- `forbidden_patterns[0]` (description="silent catch returning a fallback value — violates no-silent-catches policy", files=["bin/cli.js"]) → `forbidden_pattern__silent_catch_returning_a_fallback_value_violate__bin_cli_js`
|
|
231
|
+
- `forbidden_patterns[1]` (description="@ts-ignore escape hatch", files=["bin/cli.js"]) → `forbidden_pattern__ts_ignore_escape_hatch__bin_cli_js`
|
|
232
|
+
|
|
233
|
+
### `verification_commands[i]` slug
|
|
234
|
+
|
|
235
|
+
```
|
|
236
|
+
verification__<sha8(canonical_json(verification_obj))>
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
`canonical_json(obj)`: same compact form as Contract B (`json.dumps(obj, sort_keys=True, separators=(",", ":"), ensure_ascii=False, allow_nan=False)`).
|
|
240
|
+
`sha8(s)`: first 8 hex chars of `sha256(s.encode("utf-8"))`.
|
|
241
|
+
|
|
242
|
+
The full verification object is hashed (cmd + exit_code + stdout_contains + stdout_not_contains), so reordering the array does not change the slug. Array-index lives in `source_ref` (`expected.json/verification_commands/<i>`) for human navigation only.
|
|
243
|
+
|
|
244
|
+
### Other expected.json fields
|
|
245
|
+
|
|
246
|
+
- `required_files`: one registry entry per file path: `required_file__<sanitize(path, 60)>`.
|
|
247
|
+
- `forbidden_files`: same shape: `forbidden_file__<sanitize(path, 60)>`.
|
|
248
|
+
- `max_deps_added`: one registry entry: `max_deps_added__<value>` (e.g. `max_deps_added__0`).
|
|
249
|
+
- `spec_output_files`: one registry entry per path: `spec_output_file__<sanitize(path, 60)>`.
|
|
250
|
+
|
|
251
|
+
### Oracle category IDs (no slug — fixed strings)
|
|
252
|
+
|
|
253
|
+
Oracle `--list-categories` returns category IDs in the form `<oracle-name>:<finding-type>`. These are stable strings that idgen passes through verbatim into `required_invariants[].id`. Each oracle script defines its own enum; iter-0022 ship snapshot:
|
|
254
|
+
|
|
255
|
+
- `test-fidelity:test-file-deleted`
|
|
256
|
+
- `test-fidelity:test-file-renamed`
|
|
257
|
+
- `test-fidelity:mock-swap`
|
|
258
|
+
- `test-fidelity:assertion-regression`
|
|
259
|
+
- `scope-tier-a:lockfile-deletion`
|
|
260
|
+
- `scope-tier-a:tier-a-violation`
|
|
261
|
+
- `scope-tier-b:scope-unmatched`
|
|
262
|
+
|
|
263
|
+
`scope-tier-b:tier-b-reachable` is `info`-severity and NOT a registry entry.
|
|
264
|
+
|
|
265
|
+
## metadata.json field for per-fixture oracle allowlist
|
|
266
|
+
|
|
267
|
+
iter-0022 adds one new field to each fixture's `metadata.json`:
|
|
268
|
+
|
|
269
|
+
```jsonc
|
|
270
|
+
{
|
|
271
|
+
"id": "F2-cli-medium-subcommand",
|
|
272
|
+
// ... existing fields unchanged ...
|
|
273
|
+
"pair_plan_oracle_categories": [
|
|
274
|
+
"test-fidelity:test-file-deleted",
|
|
275
|
+
"test-fidelity:test-file-renamed",
|
|
276
|
+
"test-fidelity:mock-swap",
|
|
277
|
+
"test-fidelity:assertion-regression",
|
|
278
|
+
"scope-tier-a:lockfile-deletion",
|
|
279
|
+
"scope-tier-a:tier-a-violation",
|
|
280
|
+
"scope-tier-b:scope-unmatched"
|
|
281
|
+
]
|
|
282
|
+
}
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
Hard rule: idgen filters oracle categories to exactly this allowlist. If the field is missing, idgen treats it as the empty array (no oracle categories registered) — `expected.json`-derived invariants still appear. Schema-version bump if the allowlist semantics change.
|
|
286
|
+
|
|
287
|
+
The runner `run-fixture.sh` reads `timeout_seconds` (line 54) and the report reads `category` (compile-report.py line 76); no other consumer reads metadata.json today, so adding a new field is a pure metadata enrichment with no scoring implication.
|
|
288
|
+
|
|
289
|
+
## Plan field minimum/maximum policy
|
|
290
|
+
|
|
291
|
+
- A field listed in this schema with no "optional" annotation is REQUIRED.
|
|
292
|
+
- Fields explicitly marked optional: `source.expected_path` / `source.expected_sha256` (only when `expected.json` is genuinely absent — not the case for any current fixture).
|
|
293
|
+
- Unknown extra fields in `pair-plan.json` are NOT rejected by lint (forward-compat), but the canonical pre-stamp sha is computed over the whole object so unknown fields participate in the signature.
|
|
294
|
+
- Unknown extra fields in `canonical_id_registry.json` ARE rejected by lint (idgen owns the registry shape; drift here is a bug).
|
|
295
|
+
|
|
296
|
+
## Versioning
|
|
297
|
+
|
|
298
|
+
`schema_version` starts at `"1"`. A breaking change to any hard rule above bumps the version and the lint script gains a per-version dispatcher. iter-0022 ships version `1`. Future iters MUST update this file before bumping the version field anywhere else.
|