devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,691 @@
1
+ #!/usr/bin/env bash
2
+ # run-fixture.sh — run ONE fixture, ONE arm, end-to-end. Self-contained.
3
+ #
4
+ # Prepares a fresh work dir, applies setup, invokes the arm via `claude -p`
5
+ # subprocess (isolated session), then captures artifacts + runs verification.
6
+ #
7
+ # Usage:
8
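+ # run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new]
9
+ # run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new] --dry-run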
10
+ #
11
+ # Outputs to benchmark/auto-resolve/results/<run-id>/<fixture>/<arm>/:
12
+ # input.md, transcript.txt, diff.patch, changed-files.txt, verify.json,
13
+ # timing.json, result.json, setup.log (if setup ran)
14
+
15
+ set -euo pipefail
16
+
17
+ usage() {  # Print the accepted invocation to stderr (it is a diagnostic, not output) and exit 1.
18
+   echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]" >&2
19
+   exit 1
20
+ }
21
+
22
+ FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
23
+ RESOLVE_SKILL="new"
24
+ while [ $# -gt 0 ]; do
25
+ case "$1" in
26
+ --fixture) FIXTURE="$2"; shift 2;;
27
+ --arm) ARM="$2"; shift 2;;
28
+ --run-id) RUN_ID="$2"; shift 2;;
29
+ --resolve-skill) RESOLVE_SKILL="$2"; shift 2;;
30
+ --dry-run) DRY_RUN=1; shift;;
31
+ *) usage;;
32
+ esac
33
+ done
34
+ [ -n "$FIXTURE" ] && [ -n "$ARM" ] && [ -n "$RUN_ID" ] || usage
35
+ # iter-0019: original 3 arms — variant (L2-old: Claude orchestrator + Codex BUILD pair via --engine auto),
36
+ # solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
37
+ # bare (L0: direct claude -p, no skill, no codex).
38
+ # iter-0033c (Codex R0-infra adoption, 2026-05-02): two new arms for NEW L2 measurement on /devlyn:resolve —
39
+ # l2_gated (--engine claude, no --pair-verify; pair fires only on natural triggers),
40
+ # l2_forced (--engine claude --pair-verify; diagnostic). Both require --resolve-skill new.
41
+ [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] || [ "$ARM" = "bare" ] \
42
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ] || \
43
+ { echo "arm must be variant|solo_claude|bare|l2_gated|l2_forced"; exit 1; }
44
+ # iter-0033c (Codex R0-infra Q2): l2_* arms require NEW skill surface (only NEW
45
+ # `/devlyn:resolve` honors --pair-verify; OLD `/devlyn:auto-resolve` would silently
46
+ # ignore the flag and produce mis-attributed L2 numbers).
47
+ if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
48
+ echo "l2_* arms require --resolve-skill new (got '$RESOLVE_SKILL')"; exit 1
49
+ fi
50
+ # iter-0034 Phase 4 cutover (2026-05-03): OLD `/devlyn:auto-resolve` was
51
+ # deleted. Only `new` (= /devlyn:resolve --spec) is supported. The flag stays
52
+ # an accepted no-op so historical runners (run-iter-0033c.sh:137) keep working
53
+ # unchanged. `old` is hard-errored — silently downgrading to `new` would
54
+ # produce mis-attributed results in any pre-cutover replay attempt.
55
+ if [ "$RESOLVE_SKILL" = "old" ]; then
56
+ echo "--resolve-skill old is no longer supported: /devlyn:auto-resolve was deleted in the iter-0034 Phase 4 cutover. Use --resolve-skill new (default) or omit the flag." >&2
57
+ exit 1
58
+ fi
59
+ [ "$RESOLVE_SKILL" = "new" ] || \
60
+ { echo "--resolve-skill must be 'new' (got '$RESOLVE_SKILL')"; exit 1; }
61
+
62
+ BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
63
+ REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"
64
+
65
+ FIX_DIR=""
66
+ for candidate in "$BENCH_ROOT/fixtures/$FIXTURE" "$BENCH_ROOT/shadow-fixtures/$FIXTURE"; do
67
+ if [ -d "$candidate" ]; then FIX_DIR="$candidate"; break; fi
68
+ done
69
+ [ -n "$FIX_DIR" ] || { echo "fixture not found in fixtures/ or shadow-fixtures/: $FIXTURE"; exit 1; }
70
+
71
+ META="$FIX_DIR/metadata.json"
72
+ EXPECTED="$FIX_DIR/expected.json"
73
+ SPEC="$FIX_DIR/spec.md"
74
+ TASK="$FIX_DIR/task.txt"
75
+ SETUP="$FIX_DIR/setup.sh"
76
+ for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
77
+ [ -f "$f" ] || { echo "fixture missing required file: $f (see SCHEMA.md)"; exit 1; }
78
+ done
79
+
80
+ TIMEOUT=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1]))["timeout_seconds"])' "$META")  # path is passed as argv, so quotes in it cannot break or inject into the Python source
81
+
82
+ RESULT_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE/$ARM"
83
+ mkdir -p "$RESULT_DIR"
84
+
85
+ # Fresh copy of test-repo — order matters. We copy arm-env files (skills,
86
+ # CLAUDE.md) BEFORE the baseline commit so they do NOT appear in the diff
87
+ # the arm produces. That keeps diff.patch focused on the arm's actual code
88
+ # changes, so forbidden-pattern scans and judge rubrics see only real work.
89
+ WORK_DIR="/tmp/bench-${RUN_ID}-${FIXTURE}-${ARM}"
90
+ rm -rf "$WORK_DIR"
91
+ cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
92
+
93
+ # All skill-driven arms (variant / solo_claude / l2_gated / l2_forced) get
94
+ # devlyn skills + project CLAUDE.md pre-baseline + codex shim + monitored
95
+ # wrapper. Bare gets nothing (no skill, no shim, no env).
96
+ #
97
+ # iter-0019: solo_claude (L1) shares variant-arm staging because the L1 arm
98
+ # runs the same orchestrator on the same skills — only difference is codex
99
+ # is blocked. Shim catches PATH resolution; wrapper catches direct-path
100
+ # invocations.
101
+ # iter-0033c (Codex R0-infra Q6): l2_gated/l2_forced share variant staging
102
+ # (codex unblocked, shim+wrapper routing). Difference vs variant is the
103
+ # ENGINE_CLAUSE branch below — l2_* run --engine claude (Claude IMPLEMENT)
104
+ # while variant uses --engine auto (Codex IMPLEMENT). Pair-mode in
105
+ # /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
106
+ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
107
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
108
+ mkdir -p "$WORK_DIR/.claude"
109
+ if [ -d "$REPO_ROOT/.claude/skills" ]; then
110
+ cp -R "$REPO_ROOT/.claude/skills" "$WORK_DIR/.claude/skills"
111
+ else
112
+ echo "warning: $REPO_ROOT/.claude/skills missing — $ARM may lack project skills" >&2
113
+ fi
114
+ if [ -f "$REPO_ROOT/CLAUDE.md" ]; then
115
+ cp "$REPO_ROOT/CLAUDE.md" "$WORK_DIR/CLAUDE.md"
116
+ fi
117
+ # Stage the codex PATH shim. Required for both variant (route to monitored
118
+ # wrapper) and solo_claude (CODEX_BLOCKED enforcement at PATH layer).
119
+ if command -v codex >/dev/null 2>&1; then
120
+ CODEX_REAL_BIN="$(command -v codex)"
121
+ SHIM_SRC="$REPO_ROOT/scripts/codex-shim/codex"
122
+ WRAPPER_SRC="$REPO_ROOT/config/skills/_shared/codex-monitored.sh"
123
+ if [ ! -x "$SHIM_SRC" ] || [ ! -r "$WRAPPER_SRC" ]; then
124
+ echo "fatal: iter-0009 shim/wrapper missing at $SHIM_SRC / $WRAPPER_SRC" >&2
125
+ exit 1
126
+ fi
127
+ mkdir -p "$WORK_DIR/.devlyn-bin"
128
+ cp "$SHIM_SRC" "$WORK_DIR/.devlyn-bin/codex"
129
+ chmod +x "$WORK_DIR/.devlyn-bin/codex"
130
+ CODEX_MONITORED_PATH="$WORK_DIR/.claude/skills/_shared/codex-monitored.sh"
131
+ [ -r "$CODEX_MONITORED_PATH" ] || {
132
+ echo "fatal: codex-monitored.sh not present in staged skills at $CODEX_MONITORED_PATH" >&2
133
+ exit 1
134
+ }
135
+ export CODEX_REAL_BIN CODEX_MONITORED_PATH
136
+ SNAPSHOT_PATH=$(grep -m1 '^export PATH=' \
137
+ "$HOME/.claude/shell-snapshots/snapshot-zsh-"*.sh 2>/dev/null \
138
+ | head -1 | sed 's/^[^=]*=//' | tr -d '"' || true)
139
+ [ -n "$SNAPSHOT_PATH" ] || SNAPSHOT_PATH="$PATH"
140
+ INJECTED_PATH="$WORK_DIR/.devlyn-bin:$SNAPSHOT_PATH"
141
+ # iter-0019: arm-specific env. variant gets the codex routing pair;
142
+ # solo_claude gets CODEX_BLOCKED=1 (shim + wrapper both refuse).
143
+ # iter-0019.5 (Codex R1, 2026-04-28): when ARM=solo_claude, OMIT
144
+ # CODEX_REAL_BIN + CODEX_MONITORED_PATH from the env block. Reason: the
145
+ # absolute path to the real codex binary in CODEX_REAL_BIN is itself
146
+ # the bypass weapon — the orchestrator could `Bash("$CODEX_REAL_BIN
147
+ # exec ...")` and reach real codex, defeating both shim (PATH lookup
148
+ # bypassed) and wrapper (codex-monitored.sh not invoked). With the
149
+ # var absent: shim's BLOCKED check fires before it needs CODEX_REAL_BIN
150
+ # so removing it doesn't break the shim path; if CODEX_BLOCKED is
151
+ # somehow unset later, the shim now FAILS CLOSED (no real binary to
152
+ # delegate to) rather than silently delegating. The earlier "diagnostic
153
+ # visibility" justification was inverted — it handed the orchestrator
154
+ # the bypass weapon. Across iter-0019 paid 5-fixture run the bypass
155
+ # was OPEN but never exercised; this closes the surface preemptively
156
+ # before iter-0020's 9-fixture L0/L1/L2 run.
157
+ # iter-0033c (Codex R0-infra Q5): l2_gated/l2_forced are codex-UNBLOCKED
158
+ # (codex must be reachable for VERIFY pair-JUDGE). They take the variant
159
+ # path: ARM_CODEX_BLOCKED=0 → python writer omits CODEX_BLOCKED from env
160
+ # entirely (the shim refuses on any non-empty value, so 0 ≠ unset).
161
+ if [ "$ARM" = "solo_claude" ]; then
162
+ ARM_CODEX_BLOCKED=1
163
+ else
164
+ ARM_CODEX_BLOCKED=0
165
+ fi
166
+ python3 - "$WORK_DIR/.claude/settings.json" \
167
+ "$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" <<'PY'
168
+ import json, sys
169
+ out_path, path_val, real_bin, monitored, codex_blocked = sys.argv[1:6]
170
+ env = {
171
+ "CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1",
172
+ "PATH": path_val,
173
+ }
174
+ if codex_blocked == "1":
175
+ # iter-0019.5: solo_claude (L1 arm) — codex blocked at binary layer.
176
+ # Do NOT export CODEX_REAL_BIN / CODEX_MONITORED_PATH to the
177
+ # orchestrator subshell; those vars become bypass weapons under any
178
+ # CODEX_BLOCKED enforcement gap.
179
+ env["CODEX_BLOCKED"] = "1"
180
+ else:
181
+ # variant arm (L2) — codex routes through wrapper as part of pair-mode
182
+ # BUILD; both vars are required by the shim/wrapper handshake.
183
+ env["CODEX_REAL_BIN"] = real_bin
184
+ env["CODEX_MONITORED_PATH"] = monitored
185
+ data = {"env": env}
186
+ with open(out_path, "w") as f:
187
+ json.dump(data, f, indent=2)
188
+ f.write("\n")
189
+ PY
190
+ else
191
+ echo "warning: codex not on PATH — $ARM cannot exercise iter-0009 wrapper / iter-0019 BLOCKED enforcement" >&2
192
+ CODEX_REAL_BIN=""
193
+ CODEX_MONITORED_PATH=""
194
+ fi
195
+ fi
196
+
197
+ (cd "$WORK_DIR" \
198
+ && git init -q \
199
+ && git add -A \
200
+ && git -c user.email=b@b -c user.name=b commit -q -m baseline) \
201
+ || { echo "baseline git init failed"; exit 1; }
202
+
203
+ # Native security-review Skill expects `refs/remotes/origin/HEAD` to identify
204
+ # the diff surface. Fresh `git init` has no remote, which made a prior F8 run
205
+ # spend ~56 minutes inside CRITIC recovering this manually. Configure a
206
+ # synthetic origin pointing at the work dir itself (no network I/O) and
207
+ # wire origin/HEAD → origin/<current-branch> so security-review resolves
208
+ # immediately.
209
+ (
210
+ cd "$WORK_DIR"
211
+ git remote add origin "$WORK_DIR" 2>/dev/null || true
212
+ BRANCH=$(git branch --show-current 2>/dev/null || echo master)
213
+ git update-ref "refs/remotes/origin/$BRANCH" HEAD 2>/dev/null || true
214
+ git symbolic-ref refs/remotes/origin/HEAD "refs/remotes/origin/$BRANCH" 2>/dev/null || true
215
+ ) >/dev/null 2>&1 || true
216
+
217
+ # Fixture-specific setup (applied post-baseline so the diff shows fixture
218
+ # framing as part of the arm's environment, not its work product). Commit
219
+ # failures here break arm-only diff isolation, so fail loudly.
220
+ if [ -f "$SETUP" ] && [ -s "$SETUP" ]; then
221
+ chmod +x "$SETUP"
222
+ if ! (cd "$WORK_DIR" && "$SETUP") > "$RESULT_DIR/setup.log" 2>&1; then
223
+ echo "setup.sh failed; see $RESULT_DIR/setup.log"
224
+ exit 1
225
+ fi
226
+ if ! (cd "$WORK_DIR" \
227
+ && git add -A \
228
+ && git -c user.email=b@b -c user.name=b commit -q --allow-empty -m "fixture-setup"); then
229
+ echo "fixture-setup commit failed — arm diff isolation broken"
230
+ exit 1
231
+ fi
232
+ fi
233
+
234
+ # iter-0019.6: stage normalized .devlyn/spec-verify.json containing ONLY
235
+ # verification_commands from expected.json (no tier_a_waivers, no
236
+ # forbidden_patterns, no scope oracles — those have separate enforcement
237
+ # layers). BUILD_GATE's spec-verify-check.py reads this generic path so
238
+ # the orchestrator stays benchmark-agnostic; future /devlyn:ideate could
239
+ # generate the same shape from a spec.md "## Verification" section for
240
+ # real-user runs (Codex R5, 2026-04-28). Only the skill-driven arms are staged
241
+ # here (see the guard below); bare's .devlyn/ is created lazily by spec-verify-check.py if absent.
242
+ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
243
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
244
+ python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
245
+ import json, os, sys
246
+ expected = json.load(open(sys.argv[1]))
247
+ out_path = sys.argv[2]
248
+ normalized = {"verification_commands": expected.get("verification_commands", [])}
249
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
250
+ with open(out_path, "w") as f:
251
+ json.dump(normalized, f, indent=2)
252
+ f.write("\n")
253
+ PY
254
+ fi
255
+
256
+ # Build arm-specific prompt + place arm-specific environment files. Anything
257
+ # that's "benchmark scaffolding" (spec path placement, prompt wrapper) is
258
+ # committed to the work repo as a separate pre-model commit so the model's
259
+ # diff shows only its own work.
260
+ #
261
+ # Per-arm prompt selection is:
262
+ # 1. Fixture-id-aware for F9 (end-to-end novice fixture, no pre-placed spec).
263
+ # 2. Spec-mode `/devlyn:resolve --spec <path>` for the rest (post iter-0034
264
+ # Phase 4 cutover the OLD `/devlyn:auto-resolve` route was deleted).
265
+ PROMPT_FILE="$RESULT_DIR/input.md"
266
+ # Variant uses --engine auto (experimental dual-engine: codex BUILD + claude
267
+ # critique pair); solo_claude uses --engine claude explicitly so the orchestrator
268
+ # routes every phase to Claude and never tries to invoke codex. The CODEX_BLOCKED
269
+ # shim enforces this at the binary layer if the orchestrator misroutes. Both
270
+ # arms pass the engine flag explicitly so they survive future runtime-default
271
+ # changes (post iter-0020 close-out: default flipped to claude).
272
+ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
273
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
274
+ case "$ARM" in
275
+ solo_claude)
276
+ ENGINE_CLAUSE="--engine claude"
277
+ ENGINE_PROMPT_HINT="Run with \`--engine claude\` for every phase. Codex must not be invoked — the harness has blocked it at the binary layer for this run."
278
+ ;;
279
+ variant)
280
+ ENGINE_CLAUSE="--engine auto"
281
+ ENGINE_PROMPT_HINT="Run with \`--engine auto\` so the experimental dual-engine routing fires (Codex BUILD/FIX, Claude EVAL/CRITIC) — do not override it."
282
+ ;;
283
+ l2_gated)
284
+ # iter-0033c: NEW L2 with natural pair-mode triggers. Claude does
285
+ # IMPLEMENT; pair-JUDGE in VERIFY fires only on coverage_failed OR
286
+ # MECHANICAL warning per /devlyn:resolve PHASE 5. Codex remains
287
+ # available as the OTHER-engine pair-JUDGE candidate.
288
+ ENGINE_CLAUSE="--engine claude"
289
+ ENGINE_PROMPT_HINT="Run with \`--engine claude\` and let the orchestrator's pair-mode (VERIFY) trigger naturally per its policy. Codex is available as the OTHER-engine pair-JUDGE — the harness has not blocked it. Do NOT pass \`--pair-verify\`; this arm measures gated triggering."
290
+ ;;
291
+ l2_forced)
292
+ # iter-0033c: NEW L2 forced — pair-JUDGE always fires. Diagnostic arm
293
+ # for Gate 6 fixture-level cross-check + Gate 7 attribution causality.
294
+ ENGINE_CLAUSE="--engine claude --pair-verify"
295
+ ENGINE_PROMPT_HINT="Run with \`--engine claude --pair-verify\` so VERIFY pair-mode fires unconditionally. Codex is the OTHER-engine pair-JUDGE."
296
+ ;;
297
+ esac
298
+ if [ "$FIXTURE" = "F9-e2e-ideate-to-resolve" ]; then
299
+ # F9 NEW chain (iter-0033a): /devlyn:ideate --quick → /devlyn:resolve
300
+ # --spec <emitted-path>. No pre-placed spec; the variant arm generates it
301
+ # via ideate. No preflight (folded into resolve's VERIFY phase).
302
+ #
303
+ # --quick is mandatory in autonomous (claude -p) mode: default ideate
304
+ # invokes interactive Q&A which has no human to answer in a benchmark
305
+ # subprocess — the agent asks questions and stops. --quick uses
306
+ # single-turn assume-and-confirm: AI synthesizes the spec from the goal
307
+ # plus an explicit assumptions block, so the chain proceeds end-to-end
308
+ # without user input. Smoke 3 (iter-0033a, 2026-04-30) caught this:
309
+ # default-mode F9 produced empty diffs after 54s of Q&A waiting.
310
+ cat > "$PROMPT_FILE" <<EOF
311
+ You are a first-time devlyn-cli user. You have a vague idea and want the 2-skill harness to take it from unstructured ask to shipped, verified feature. Run the chain:
312
+
313
+ 1. Invoke \`/devlyn:ideate --quick ${ENGINE_CLAUSE}\` to turn the idea into a verifiable spec. \`--quick\` is mandatory: this is an autonomous run with no human to answer interactive questions, so ideate must synthesize the spec single-turn from the goal text and emit assumptions explicitly. The skill announces \`spec ready — /devlyn:resolve --spec <emitted-path>\` when done. The emitted spec lives at \`docs/specs/<id>-<slug>/spec.md\` with a sibling \`spec.expected.json\`.
314
+ 2. Take the emitted spec path verbatim from the announce line and invoke \`/devlyn:resolve --spec <that-path> ${ENGINE_CLAUSE}\` to run PLAN → IMPLEMENT → BUILD_GATE → CLEANUP → VERIFY (VERIFY is the fresh-subagent final phase — there is no separate preflight skill in the 2-skill design).
315
+
316
+ ${ENGINE_PROMPT_HINT}
317
+
318
+ Follow the skills to completion. Do not short-circuit. Do not invoke \`/devlyn:auto-resolve\` or \`/devlyn:preflight\` — they are not part of the 2-skill chain. Do not stop after ideate; the chain only counts as complete after \`/devlyn:resolve\` returns a terminal verdict.
319
+
320
+ After the whole chain, briefly report: (a) the spec path ideate produced, (b) the resolve terminal verdict, (c) whether VERIFY surfaced any findings.
321
+
322
+ RAW IDEA:
323
+ $(cat "$TASK")
324
+ EOF
325
+ else
326
+ # Spec-mode /devlyn:resolve: spec pre-placed at the canonical roadmap path
327
+ # the harness has used since iter-0019. Pre-Phase-4 this branch shared
328
+ # staging with the OLD /devlyn:auto-resolve route; iter-0034 deleted the
329
+ # OLD branch and this is now the only non-F9 path.
330
+ mkdir -p "$WORK_DIR/docs/roadmap/phase-1"
331
+ cp "$SPEC" "$WORK_DIR/docs/roadmap/phase-1/$FIXTURE.md"
332
+ cat > "$PROMPT_FILE" <<EOF
333
+ Use the \`/devlyn:resolve --spec docs/roadmap/phase-1/$FIXTURE.md ${ENGINE_CLAUSE}\` skill to implement the spec. ${ENGINE_PROMPT_HINT}
334
+
335
+ The 2-skill design folds verification into resolve's VERIFY phase — there is no separate \`/devlyn:preflight\`, \`/devlyn:auto-resolve\`, or other 3-skill orchestrator at HEAD.
336
+
337
+ After the pipeline finishes, report the terminal verdict and list of files changed so the benchmark runner can capture state.
338
+ EOF
339
+ fi
340
+ else
341
+ # Bare — same prompt for F9 as any other fixture: task.txt with anti-skill rules.
342
+ cat > "$PROMPT_FILE" <<EOF
343
+ You are acting as a smart engineer implementing the following request directly. No skill pipeline.
344
+
345
+ HARD RULES:
346
+ - Do NOT invoke any \`/devlyn:*\` skill (no auto-resolve, evaluate, review, clean, update-docs, team-*, etc.).
347
+ - Do NOT invoke native \`simplify\` or \`security-review\` skills.
348
+ - Use only direct tools: Read, Write, Edit, Grep, Glob, Bash.
349
+ - Write code to satisfy the request. Run the verification commands the user implies. Fix failures until they pass.
350
+
351
+ REQUEST:
352
+ $(cat "$TASK")
353
+ EOF
354
+ fi
355
+
356
# Snapshot the prepared work tree as a "bench-scaffold" commit so the arm-only
# diff taken later excludes the scaffolding. If this commit fails, arm work
# would be mixed with scaffolding in that diff, so abort loudly instead of
# producing corrupted data.
(
  cd "$WORK_DIR" \
    && git add -A \
    && git -c user.email=b@b -c user.name=b commit -q --allow-empty -m "bench-scaffold"
) || {
  echo "bench-scaffold commit failed — arm diff isolation broken"
  exit 1
}
# Remember the scaffold commit. The arm may create commits of its own while it
# runs, so the arm-only diff has to be taken against this SHA; a diff against
# HEAD would miss anything the arm already committed.
SCAFFOLD_SHA=$(git -C "$WORK_DIR" rev-parse HEAD)
369
+
370
# Record the start of the run. timing.json is created here with the run
# identity and start time; the aggregator at the end of the script re-reads it
# and adds the end/elapsed/timeout fields.
T_START="$(date +%s)"
printf '{\n  "run_id": "%s",\n  "fixture": "%s",\n  "arm": "%s",\n  "work_dir": "%s",\n  "start_epoch": %s\n}\n' \
  "$RUN_ID" "$FIXTURE" "$ARM" "$WORK_DIR" "$T_START" \
  > "$RESULT_DIR/timing.json"
381
+
382
# --- Invocation -------------------------------------------------------------
# INVOKE_EXIT keeps the arm's exit status so an infrastructure failure is not
# mistaken for a weak diff; it is reported as invoke_exit in result.json.
INVOKE_EXIT=0
# iter-0012: WATCHDOG_FIRED is the truth source for `timed_out` in result.json.
# It flips to 1 only when the watchdog's flag file exists after the arm has
# been waited on (the TIMEOUT_FLAG check at the end of this block). It is
# initialized here so the later `export` is safe under `set -u` on the dry-run
# path, which never sets it.
WATCHDOG_FIRED=0
if [ "$DRY_RUN" -eq 1 ]; then
  echo "[run-fixture] DRY RUN — prepared $WORK_DIR, skipping model invocation" \
    > "$RESULT_DIR/transcript.txt"
else
  command -v claude >/dev/null 2>&1 || {
    echo "claude CLI not on PATH — cannot invoke arm"; exit 1;
  }
  # Arm uses real HOME so Claude auth (macOS Keychain + ~/.claude session
  # state) works. Fixtures that need HOME isolation override it inline in
  # their verification commands (e.g. F2 uses `HOME=/nonexistent` per command).
  # Variant-arm skills are resolved from $WORK_DIR/.claude/skills (project
  # scope), so bare-arm runs never see them regardless of HOME.
  #
  # Portable wall-clock watchdog. macOS lacks GNU `timeout` by default; the
  # earlier fallback ran arms unbounded, which produced a multi-hour F7 hang
  # when the inner `codex exec` raced against a lingering codex-mcp-server.
  # The arm is backgrounded in its own process group (`set -m` + `exec`) so
  # the watchdog can `kill -- -PGID` and take codex/codex-mcp-server
  # descendants down together with the parent. A flag file disambiguates
  # timeout from natural exit; on timeout INVOKE_EXIT becomes 124 (GNU
  # timeout convention) and WATCHDOG_FIRED becomes 1, which the Python
  # aggregator at the end of the script turns into result.json.timed_out —
  # so a natural exit at or past the budget is not mislabeled as a timeout.
  #
  # MCP/config isolation (iter 0004). The harness's `claude -p` subprocess
  # must not load the operator's user-level MCP plugins (pencil, codex-cli,
  # telegram, vercel, …). Project policy is "MCP is not in the loop"; loading
  # user MCP inside the variant arm is uncontrolled environment leaking into
  # the experiment, and it is the most plausible cause of the F7 0-byte-
  # transcript hang. `--strict-mcp-config` + an empty `mcpServers` object
  # forces a hermetic subprocess. Skills still resolve via `/skill-name`.
  # `--debug-file` records per-arm init/runtime so the next hang has a
  # location, not a guess.
  TIMEOUT_FLAG="$RESULT_DIR/.timed_out"
  rm -f "$TIMEOUT_FLAG"

  # Errors are handled by hand from here until the closing `set -e`.
  set +e
  # Job control on: the backgrounded subshell becomes its own process group,
  # whose id equals CHILD_PID.
  set -m
  (
    cd "$WORK_DIR"
    # iter-0009 + iter-0019: prepend the codex shim PATH for any arm that
    # staged one. variant routes through codex-monitored.sh; solo_claude
    # refuses on CODEX_BLOCKED=1; bare has no shim.
    # iter-0033c (Codex R0-infra Q6): l2_gated/l2_forced ALSO need the shim
    # PATH — they route Claude IMPLEMENT but Codex pair-JUDGE in VERIFY hits
    # `codex exec` through the wrapper for starvation safety.
    if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
         || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } \
       && [ -x "$WORK_DIR/.devlyn-bin/codex" ]; then
      export PATH="$WORK_DIR/.devlyn-bin:$PATH"
      [ "$ARM" = "solo_claude" ] && export CODEX_BLOCKED=1
    fi
    # iter-0019.6: BUILD_GATE's spec-verify-check.py uses BENCH_WORKDIR for
    # commands that escape the work-dir (e.g. F9's outside-repo check via
    # `cd /tmp && node $BENCH_WORKDIR/bin/cli.js gitstats`). This mirrors what
    # the post-run verification step later in this script puts into its
    # command environment, so the gate sees the same environment shape.
    export BENCH_WORKDIR="$WORK_DIR"
    exec claude \
      -p "$(cat "$PROMPT_FILE")" \
      --dangerously-skip-permissions \
      --effort xhigh \
      --strict-mcp-config \
      --mcp-config '{"mcpServers":{}}' \
      --debug-file "$RESULT_DIR/claude-debug.log"
  ) > "$RESULT_DIR/transcript.txt" 2>&1 &
  CHILD_PID=$!
  set +m

  (
    # The budget sleep runs in the background and is `wait`ed on: a trapped
    # TERM interrupts `wait` right away, so cancelling the watchdog also
    # kills the sleep. A foreground sleep would be orphaned by the cancel and
    # keep this script's stdout/stderr open for up to $TIMEOUT seconds,
    # stalling anything that reads the harness output through a pipe.
    SLEEP_PID=""
    trap '[ -n "$SLEEP_PID" ] && kill "$SLEEP_PID" 2>/dev/null; exit 0' TERM
    sleep "$TIMEOUT" &
    SLEEP_PID=$!
    wait "$SLEEP_PID"
    if kill -0 "$CHILD_PID" 2>/dev/null; then
      # Budget exhausted. Ignore the parent's cancel from here on so the
      # TERM → KILL escalation always completes; otherwise the KILL sweep is
      # skipped as soon as the group leader exits on TERM, and descendants
      # that ignored TERM survive. The parent's `wait` on the watchdog is
      # therefore held for at most the 5 s grace period.
      trap '' TERM
      : > "$TIMEOUT_FLAG"
      kill -TERM -- "-$CHILD_PID" 2>/dev/null
      sleep 5
      kill -KILL -- "-$CHILD_PID" 2>/dev/null
    fi
  ) &
  WATCHDOG_PID=$!

  wait "$CHILD_PID"
  INVOKE_EXIT=$?

  # Arm is done: cancel the watchdog (a no-op if it is mid-escalation) and
  # reap it.
  kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
  wait "$WATCHDOG_PID" 2>/dev/null || true

  if [ -f "$TIMEOUT_FLAG" ]; then
    INVOKE_EXIT=124
    WATCHDOG_FIRED=1
    rm -f "$TIMEOUT_FLAG"
    echo "[run-fixture] arm timed out after ${TIMEOUT}s — INVOKE_EXIT=124" >&2
  fi
  set -e
fi
486
+
487
T_END=$(date +%s)
ELAPSED=$((T_END - T_START))

# Capture the ARM-ONLY diff against the scaffold commit. The arm may commit
# internally while it runs, so diffing against HEAD would miss committed work.
# Diffing against SCAFFOLD_SHA after `git add -A` picks up both the
# scaffold..HEAD committed deltas AND any uncommitted leftovers (modified or
# untracked files, which `git add -A` has just staged).
#
# git's stderr goes to its own file rather than into the artifacts: diff.patch
# is scanned for forbidden patterns and every line of changed-files.txt is
# counted as a changed file, so a warning mixed into either would be read as
# arm output. Capture stays best-effort (`|| true`).
(cd "$WORK_DIR" \
  && git add -A 2>/dev/null \
  && git diff "$SCAFFOLD_SHA") \
  > "$RESULT_DIR/diff.patch" 2> "$RESULT_DIR/diff-capture.err" || true
(cd "$WORK_DIR" \
  && git diff "$SCAFFOLD_SHA" --name-only) \
  > "$RESULT_DIR/changed-files.txt" 2>> "$RESULT_DIR/diff-capture.err" || true
500
+
501
# Deterministic oracles (step 1+ of the benchmark-extension plan).
# Findings-only at this stage; scoring integration is step 5.
#
# run_oracle NAME [EXTRA_ARGS...]
#   Runs scripts/oracle-NAME.py against the work dir and scaffold commit and
#   stores its stdout in $RESULT_DIR/oracle-NAME.json (stderr is discarded).
#   If the oracle exits non-zero, the file is overwritten with a stub record
#   that has no findings and an "oracle invocation failed" error.
run_oracle() {
  _oracle_name="$1"
  shift
  python3 "$BENCH_ROOT/scripts/oracle-$_oracle_name.py" \
      --work "$WORK_DIR" --scaffold "$SCAFFOLD_SHA" "$@" \
      > "$RESULT_DIR/oracle-$_oracle_name.json" 2>/dev/null \
    || printf '{"oracle":"%s","findings":[],"error":"oracle invocation failed"}\n' \
         "$_oracle_name" > "$RESULT_DIR/oracle-$_oracle_name.json"
}

run_oracle test-fidelity
run_oracle scope-tier-a --expected "$EXPECTED"
run_oracle scope-tier-b --expected "$EXPECTED"
520
+
521
+ # Run verification commands + forbidden pattern scan + deps check. Uses
522
+ # the operator's real HOME (same as the arm saw). Fixtures that need HOME
523
+ # isolation override it inline per verification command.
524
+ python3 - "$EXPECTED" "$RESULT_DIR" "$WORK_DIR" <<'PY'
525
+ import json, os, re, subprocess, sys
526
+
527
# Positional arguments: expected.json path, result directory, work directory.
# The expectations file is read through a context manager so the handle is
# closed deterministically instead of being left to garbage collection.
with open(sys.argv[1], encoding="utf-8") as fh:
    expected = json.load(fh)
result_dir = sys.argv[2]
work = sys.argv[3]

# Verification commands inherit the caller's environment, plus the work-dir
# path so fixtures whose verification needs to reference the work root can do
# so portably (e.g. F9's out-of-repo check).
verify_env = os.environ.copy()
verify_env["BENCH_WORKDIR"] = work

# Accumulator for everything this script measures; later sections fill it in
# and it is finally written out as verify.json.
verify = {
    "commands": [],
    "forbidden_pattern_hits": [],
    "deps_added": 0,
    "max_deps_added": expected.get("max_deps_added", 0),
    "missing_required_files": [],
    "forbidden_files_present": [],
}
539
+
540
# Run every fixture-declared verification command through the shell, inside
# the work dir, with a 60-second limit each. A command passes when its exit
# code matches the expected one (default 0), every `stdout_contains` string
# occurs in the combined stdout+stderr, and no `stdout_not_contains` string
# does. The first failed criterion, in that order, is recorded as the reason.
for check in expected.get("verification_commands", []):
    try:
        completed = subprocess.run(
            check["cmd"],
            cwd=work,
            shell=True,
            env=verify_env,
            capture_output=True,
            text=True,
            timeout=60,
        )
        combined = (completed.stdout or "") + (completed.stderr or "")
        want_exit = check.get("exit_code", 0)
        exit_ok = completed.returncode == want_exit
        has_required = all(
            needle in combined for needle in check.get("stdout_contains", [])
        )
        lacks_banned = not any(
            needle in combined for needle in check.get("stdout_not_contains", [])
        )

        if not exit_ok:
            reason = "exit"
        elif not has_required:
            reason = "missing_contains"
        elif not lacks_banned:
            reason = "unexpected_text"
        else:
            reason = None

        verify["commands"].append({
            "cmd": check["cmd"],
            "expected_exit": want_exit,
            "actual_exit": completed.returncode,
            "pass": reason is None,
            "reason": reason,
            # Only the last 500 characters of output are kept.
            "stdout_tail": combined[-500:],
        })
    except subprocess.TimeoutExpired:
        verify["commands"].append(
            {"cmd": check["cmd"], "pass": False, "reason": "timeout"}
        )
    except Exception as err:
        # Any other failure to run the command is recorded, not raised.
        verify["commands"].append({
            "cmd": check["cmd"],
            "pass": False,
            "reason": f"error:{err.__class__.__name__}:{err}",
        })
563
+
564
# Forbidden pattern scan over diff.patch. Each pattern may declare a `files`
# allowlist; when present, we slice the diff to only those files' hunks.
#
# The patch is decoded as UTF-8 with undecodable bytes replaced. Under a
# strict decode, one non-UTF-8 byte anywhere in the arm's diff would raise,
# leave diff_text empty and silently skip the whole forbidden-pattern scan.
# A missing or unreadable file still degrades to an empty diff.
diff_text = ""
try:
    with open(os.path.join(result_dir, "diff.patch"),
              encoding="utf-8", errors="replace") as fh:
        diff_text = fh.read()
except OSError:
    pass
572
+
573
def _paths_in_diff_header(header):
    """Return the file path(s) named by a `diff --git ...` header line."""
    rest = header[len("diff --git "):].strip()
    # Common case "a/<path> b/<path>": both halves are equal in length, which
    # lets paths containing spaces be split correctly.
    mid = len(rest) // 2
    if len(rest) % 2 == 1 and rest[mid] == " ":
        old, new = rest[:mid], rest[mid + 1:]
        if old.startswith("a/") and new.startswith("b/") and old[2:] == new[2:]:
            return [old[2:]]
    # Renames/copies (two different paths) or quoted paths: fall back to
    # whitespace tokens with the a/ and b/ prefixes removed.
    paths = []
    for token in rest.split():
        token = token.strip('"')
        if token[:2] in ("a/", "b/"):
            token = token[2:]
        paths.append(token)
    return paths


def _path_matches(path, entry):
    """True when `entry` names `path` on path-component boundaries.

    An entry matches a path it equals, a path it is a trailing part of
    (`cli.js` vs `bin/cli.js`), and any path inside a directory it names
    (`src` or `src/` vs `src/a.js`).
    """
    if entry.startswith("./"):
        entry = entry[2:]
    entry = entry.strip("/")
    if not entry:
        return False
    return f"/{entry}/" in f"/{path}/"


def slice_diff_to_files(diff, files):
    """Return the subset of a unified diff touching any of `files`.

    Args:
        diff: Unified diff text as produced by `git diff`.
        files: Allowlist of file or directory paths. An empty or missing
            allowlist returns `diff` unchanged.

    Returns:
        The per-file sections of `diff` whose `diff --git` header names an
        allowlisted path, concatenated in their original order. Sections for
        other files, and anything before the first header, are dropped.

    Entries are compared against the header's paths on path-component
    boundaries rather than as raw substrings of the header line, so an entry
    such as `a.js` no longer pulls in `data.js`.
    """
    if not files:
        return diff
    kept, keep = [], False
    for line in diff.splitlines(keepends=True):
        if line.startswith("diff --git "):
            # A header starts a new per-file section; decide once per section.
            keep = any(
                _path_matches(path, entry)
                for path in _paths_in_diff_header(line)
                for entry in files
            )
        if keep:
            kept.append(line)
    return "".join(kept)
585
+
586
# Search the diff for each forbidden regex. A rule with a `files` allowlist is
# matched only against those files' sections of the diff; a rule without one
# is matched against the whole diff. Every match is recorded with the rule's
# severity (default "warning") and description.
for rule in expected.get("forbidden_patterns", []):
    allowlist = rule.get("files") or []
    if not re.search(rule["pattern"], slice_diff_to_files(diff_text, allowlist)):
        continue
    verify["forbidden_pattern_hits"].append({
        "pattern": rule["pattern"],
        "severity": rule.get("severity", "warning"),
        "description": rule.get("description", ""),
        "scoped_to": allowlist or "all",
    })
595
+
596
# Deps added count (naive: looks at added lines under the "dependencies" /
# "devDependencies" keys of the root package.json).
#
# The count is taken from diff_text, i.e. the diff.patch captured against the
# scaffold commit, rather than from a fresh `git diff HEAD`: a dependency the
# arm added and then committed is no longer visible in a diff against HEAD.
def _count_added_deps(patch):
    """Return how many dependency names the root package.json gained.

    Args:
        patch: Unified diff text (`git diff` output with a/ b/ prefixes).

    Returns:
        The number of distinct `"name": "version"` entries added inside a
        `"dependencies"` or `"devDependencies"` object of the root
        package.json, excluding names that the same patch also removes there
        (a version bump or a move between the two objects is not an addition).

    This stays a line-based heuristic: an added entry is only seen when the
    key line of its enclosing object appears in the diff context.
    """
    entry = re.compile(r'"([^"]+)"\s*:\s*"[^"]+"')
    added, removed = set(), set()
    in_package_json = in_deps = False
    for line in patch.splitlines():
        if line.startswith("diff --git "):
            in_package_json = (
                line.rstrip() == "diff --git a/package.json b/package.json"
            )
            in_deps = False
            continue
        # Skip other files, and the ---/+++ file header lines. Only those
        # headers are skipped: ordinary added lines start with "+" followed by
        # the JSON indentation and must not be discarded.
        if not in_package_json or line.startswith(("+++ ", "--- ")):
            continue
        marker, content = line[:1], line[1:]
        if '"dependencies"' in content or '"devDependencies"' in content:
            in_deps = True
        elif content.strip().startswith("}"):
            in_deps = False
        elif in_deps and marker in ("+", "-"):
            found = entry.search(content)
            if found:
                (added if marker == "+" else removed).add(found.group(1))
    return len(added - removed)


verify["deps_added"] = _count_added_deps(diff_text)
613
+
614
# Required / forbidden files.
# `changed` is the list of non-blank, whitespace-stripped lines of
# changed-files.txt; if that file cannot be read it is treated as empty.
changed_list_path = os.path.join(result_dir, "changed-files.txt")
try:
    with open(changed_list_path) as fh:
        stripped = (raw.strip() for raw in fh.read().splitlines())
        changed = [name for name in stripped if name]
except Exception:
    changed = []

# A required file is missing when it does not exist in the work dir.
missing = []
for rel_path in expected.get("required_files", []):
    if not os.path.exists(os.path.join(work, rel_path)):
        missing.append(rel_path)
verify["missing_required_files"] = missing

# A forbidden file is present when it is listed among the changed files.
present = []
for rel_path in expected.get("forbidden_files", []):
    if rel_path in changed:
        present.append(rel_path)
verify["forbidden_files_present"] = present
627
+
628
# Summarize command results. With no verification commands the score is 1.0.
total = len(verify["commands"])
passed = sum(1 for r in verify["commands"] if r.get("pass"))
verify["commands_passed"] = passed
verify["commands_total"] = total
verify["verify_score"] = (passed / total) if total else 1.0

# A run is disqualified by any forbidden-pattern hit of severity
# "disqualifier", by adding more dependencies than allowed, by a missing
# required file, or by a forbidden file among the changed files.
verify["disqualifier"] = (
    any(h["severity"] == "disqualifier" for h in verify["forbidden_pattern_hits"])
    or verify["deps_added"] > verify["max_deps_added"]
    or bool(verify["missing_required_files"])
    or bool(verify["forbidden_files_present"])
)

# Written through a context manager so the file is flushed and closed before
# the next step of the harness reads verify.json.
with open(os.path.join(result_dir, "verify.json"), "w") as fh:
    json.dump(verify, fh, indent=2)
642
+ PY
643
+
644
+ # Timing + aggregate
645
+ export INVOKE_EXIT WATCHDOG_FIRED
646
+ python3 - "$RESULT_DIR" "$FIXTURE" "$ARM" "$RUN_ID" "$T_END" "$ELAPSED" "$TIMEOUT" <<'PY'
647
+ import json, os, sys
648
# Positional arguments: result dir, fixture, arm, run id, end epoch, elapsed
# seconds, timeout budget in seconds.
result_dir, fixture, arm, run_id = sys.argv[1:5]
t_end, elapsed, timeout = int(sys.argv[5]), int(sys.argv[6]), int(sys.argv[7])

# Complete the timing record that was started before the arm ran.
timing_path = os.path.join(result_dir, "timing.json")
with open(timing_path) as fh:
    timing = json.load(fh)
timing["end_epoch"] = t_end
timing["elapsed_seconds"] = elapsed
timing["timeout_seconds"] = timeout
# iter-0012: derive from watchdog signal, not elapsed wall time. Natural
# exits at-or-past the budget (budget == elapsed, or up to ~5s past due to
# SIGTERM grace) are no longer mislabeled as timeouts. Source of truth is the
# WATCHDOG_FIRED environment variable exported by the calling shell script.
timing["timed_out"] = os.environ.get("WATCHDOG_FIRED", "0") == "1"
with open(timing_path, "w") as fh:
    json.dump(timing, fh, indent=2)

with open(os.path.join(result_dir, "verify.json")) as fh:
    verify = json.load(fh)

# diff_bytes is the on-disk size of the patch. Taking it from the file system
# gives real bytes (not decoded characters) and still works when the patch
# contains bytes that are not valid text. A missing patch counts as 0.
try:
    diff_size = os.path.getsize(os.path.join(result_dir, "diff.patch"))
except OSError:
    diff_size = 0

try:
    with open(os.path.join(result_dir, "changed-files.txt")) as fh:
        changed = [l for l in fh.read().splitlines() if l.strip()]
except Exception:
    changed = []

# INVOKE_EXIT is exported by the calling shell script; absent means 0.
invoke_exit = int(os.environ.get("INVOKE_EXIT", "0"))

result = {
    "fixture": fixture,
    "arm": arm,
    "run_id": run_id,
    "disqualifier": verify.get("disqualifier", False),
    "verify_score": verify.get("verify_score", 0.0),
    "commands_passed": verify.get("commands_passed", 0),
    "commands_total": verify.get("commands_total", 0),
    "diff_bytes": diff_size,
    "files_changed": len(changed),
    "elapsed_seconds": elapsed,
    "timed_out": timing["timed_out"],
    "invoke_exit": invoke_exit,
    # A non-zero exit counts as an invocation failure only when it was not
    # caused by the watchdog timeout.
    "invoke_failure": invoke_exit != 0 and not timing["timed_out"],
}
with open(os.path.join(result_dir, "result.json"), "w") as fh:
    json.dump(result, fh, indent=2)
print(json.dumps(result, indent=2))
689
+ PY
690
+
691
+ echo "[run-fixture] done: $RESULT_DIR"