@tekyzinc/gsd-t 3.18.13 → 3.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,6 +62,19 @@ function _emit(projectDir, ev) {
62
62
  try { _esAppendEvent(projectDir, ev); } catch (_) { /* never halt the loop */ }
63
63
  }
64
64
 
65
+ // M44 D9 (v1.5.0) — planner-driven multi-worker fan-out. Lazy-loaded so unit
66
+ // tests can stub via deps._runParallel without touching the real module.
67
+ let _parallelModule = null;
68
+ function _loadRunParallel() {
69
+ if (_parallelModule) return _parallelModule;
70
+ try {
71
+ _parallelModule = require("./gsd-t-parallel.cjs");
72
+ } catch {
73
+ _parallelModule = { runParallel: () => ({ workerCount: 0, parallelTasks: [], plan: [] }) };
74
+ }
75
+ return _parallelModule;
76
+ }
77
+
65
78
  // M42 D1 — transcript tee. Captures each worker's stdout lines to an ndjson
66
79
  // file and registers the spawn so the dashboard sidebar can list + render it.
67
80
  // Best-effort: every call is swallowed so tee failures never halt the loop.
@@ -73,7 +86,7 @@ const { checkHeartbeat: _checkHeartbeat } = require("./gsd-t-unattended-heartbea
73
86
 
74
87
  // ── Constants ───────────────────────────────────────────────────────────────
75
88
 
76
- const CONTRACT_VERSION = "1.4.0";
89
+ const CONTRACT_VERSION = "1.5.0";
77
90
  const UNATTENDED_DIR_REL = path.join(".gsd-t", ".unattended");
78
91
  const PID_FILE = "supervisor.pid";
79
92
  const STATE_FILE = "state.json";
@@ -122,6 +135,8 @@ module.exports = {
122
135
  releaseSleepPrevention,
123
136
  runMainLoop,
124
137
  _spawnWorker,
138
+ _spawnWorkerFanOut,
139
+ _partitionTasks,
125
140
  _appendRunLog,
126
141
  CONTRACT_VERSION,
127
142
  UNATTENDED_DIR_REL,
@@ -132,6 +147,50 @@ module.exports = {
132
147
  DEFAULT_HEARTBEAT_POLL_MS,
133
148
  };
134
149
 
150
+ function _reconcile(state, results) {
151
+ if (!Array.isArray(results) || results.length === 0) return;
152
+ for (const r of results) {
153
+ if (!r || typeof r !== 'object') continue;
154
+ // append-only completedTasks (preserve order, dedupe)
155
+ if (Array.isArray(r.tasksDone) && r.tasksDone.length > 0) {
156
+ const current = new Set(state.completedTasks || []);
157
+ for (const t of r.tasksDone) {
158
+ if (!current.has(t)) {
159
+ state.completedTasks = (state.completedTasks || []).concat([t]);
160
+ current.add(t);
161
+ }
162
+ }
163
+ }
164
+ // last-writer-wins on status — but 'error' is sticky: once set, it stays
165
+ // until the next explicit non-error status in a later iter.
166
+ if (r.status && r.status !== state.status) {
167
+ state.status = r.status;
168
+ }
169
+ // verifyNeeded is OR-across-results: any iter that flags it wins.
170
+ if (r.verifyNeeded === true) {
171
+ state.verifyNeeded = true;
172
+ }
173
+ // artifacts: append-only, concat arrays.
174
+ if (Array.isArray(r.artifacts) && r.artifacts.length > 0) {
175
+ state.artifacts = (state.artifacts || []).concat(r.artifacts);
176
+ }
177
+ }
178
+ // NOTE: `state.iter` is advanced by the main while loop (pre-M46 contract:
179
+ // one increment per fan-out pass, regardless of worker/batch count). We do
180
+ // NOT advance it here — doing so would double-increment against the
181
+ // existing supervisor-contract invariant (surfaced by m43/m44 tests).
182
+ state.lastBatch = {
183
+ size: results.length,
184
+ endedAt: new Date().toISOString(),
185
+ errorCount: results.filter(r => r && r.status === 'error').length,
186
+ };
187
+ }
188
+
189
+ // M46 D1 T2 — expose the extracted single-iter body for future unit tests
190
+ // (T7) and the iter-parallel driver (T4/T5). Kept out of the main exports
191
+ // block so consumers don't accidentally import implementation details.
192
+ module.exports.__test__ = { _runOneIter, _computeIterBatchSize, _runIterParallel, _reconcile };
193
+
135
194
  // ── parseArgs ───────────────────────────────────────────────────────────────
136
195
 
137
196
  /**
@@ -927,6 +986,10 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
927
986
  deps._spawnWorker || (useTestStub ? _testModeSpawnWorker : _spawnWorker);
928
987
  const milestoneComplete =
929
988
  deps._isMilestoneComplete || (useTestStub ? () => true : isMilestoneComplete);
989
+ // M44 D9 (v1.5.0) — planner injected for multi-worker iter fan-out.
990
+ // Tests stub via deps._runParallel; production lazy-loads from gsd-t-parallel.cjs.
991
+ const runParallelImpl =
992
+ deps._runParallel || ((o) => _loadRunParallel().runParallel(o));
930
993
  const stopCheck = deps._stopRequested || stopRequested;
931
994
  const workerTimeoutMs = opts.workerTimeoutMs || DEFAULT_WORKER_TIMEOUT_MS;
932
995
  const staleHeartbeatMs =
@@ -947,61 +1010,200 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
947
1010
  const heartbeatEnabled = !deps._disableHeartbeat && !useTestStub;
948
1011
  const projectDir = state.projectDir;
949
1012
 
1013
+ // M46 D1 T2 — pure extract-method refactor. The body of each iteration
1014
+ // now lives in the top-level `_runOneIter` helper (below). The while loop
1015
+ // itself is unchanged in semantics: stop-check and isDone evaluate per
1016
+ // pass, and any terminal state.status ({"done","failed"}) written by the
1017
+ // iter body causes us to break, matching every pre-refactor `break` path.
1018
+ // Non-terminal outcomes fall through to the next iteration, matching the
1019
+ // pre-refactor `continue` paths.
1020
+ const iterCtx = {
1021
+ dir,
1022
+ fn,
1023
+ config,
1024
+ spawnWorker,
1025
+ milestoneComplete,
1026
+ runParallelImpl,
1027
+ workerTimeoutMs,
1028
+ heartbeatImpl,
1029
+ heartbeatEnabled,
1030
+ staleHeartbeatMs,
1031
+ heartbeatPollMs,
1032
+ projectDir,
1033
+ verbose: !!opts.verbose,
1034
+ };
950
1035
  while (!isDone(state) && !stopCheck(projectDir)) {
951
- // ── PRE-WORKER HOOK (contract §12) ─────────────────────────────────────
952
- // Refusal → halt with status=failed, lastExit=6 (caps) or 2 (validate).
953
- const capIter = fn.checkIterationCap(state, config);
954
- if (!capIter.ok) {
955
- state.status = "failed";
956
- state.lastExit = capIter.code || 6;
957
- writeState(state, dir);
958
- break;
959
- }
960
- const capWall = fn.checkWallClockCap(state, config);
961
- if (!capWall.ok) {
962
- state.status = "failed";
963
- state.lastExit = capWall.code || 6;
964
- writeState(state, dir);
965
- break;
966
- }
967
- const vRes = fn.validateState(state);
968
- if (!vRes.ok) {
969
- state.status = "failed";
970
- state.lastExit = vRes.code || 2;
971
- writeState(state, dir);
972
- break;
973
- }
1036
+ const batchSize = _computeIterBatchSize(state, opts);
1037
+ const _batchStartMs = Date.now();
1038
+ try {
1039
+ fs.appendFileSync(
1040
+ path.join(dir, RUN_LOG),
1041
+ `[iter-batch-start] batch-size=${batchSize} iter=${state.iter} ts=${new Date(_batchStartMs).toISOString()}\n`,
1042
+ "utf8"
1043
+ );
1044
+ } catch (_) { /* best effort */ }
1045
+ const results = await _runIterParallel(state, opts, (s, o) => _runOneIter(s, iterCtx), batchSize);
1046
+ _reconcile(state, results);
1047
+ try {
1048
+ const _ok = results.filter((r) => r.status !== "error").length;
1049
+ const _fail = results.length - _ok;
1050
+ const _durSec = ((Date.now() - _batchStartMs) / 1000).toFixed(1);
1051
+ fs.appendFileSync(
1052
+ path.join(dir, RUN_LOG),
1053
+ `[iter-batch-complete] size=${results.length} ok=${_ok} fail=${_fail} duration=${_durSec}s iter=${state.iter}\n`,
1054
+ "utf8"
1055
+ );
1056
+ } catch (_) { /* best effort */ }
1057
+ if (isTerminal(state.status)) break;
1058
+ }
974
1059
 
975
- // Pre-spawn bookkeeping
976
- state.iter = (state.iter || 0) + 1;
977
- const workerStart = new Date();
978
- state.lastWorkerStartedAt = workerStart.toISOString();
1060
+ // If we exited because the user dropped a stop sentinel and no terminal
1061
+ // status has been assigned yet, transition to 'stopped' now (contract §10).
1062
+ // The sentinel file itself is NOT removed by the supervisor — it stays on
1063
+ // disk as evidence, to be cleaned by the next launch via
1064
+ // `cleanStaleStopSentinel`.
1065
+ if (!isTerminal(state.status) && stopCheck(projectDir)) {
1066
+ state.status = "stopped";
979
1067
  writeState(state, dir);
1068
+ }
1069
+ return state;
1070
+ }
1071
+
1072
+ // ── _runOneIter (M46 D1 T2) ─────────────────────────────────────────────────
1073
+
1074
+ /**
1075
+ * Body of a single supervisor iteration, extracted verbatim from the
1076
+ * `runMainLoop` while-loop (pre-M46-D1). Mutates `state` in place exactly as
1077
+ * the original body did — all writeState calls, event-stream emits, run.log
1078
+ * and token-log appends, heartbeat wiring, fan-out dispatch, and exit-code
1079
+ * classification are preserved line-for-line.
1080
+ *
1081
+ * `opts` here is the per-iter context bundle assembled in runMainLoop (not
1082
+ * the supervisor-level opts object). It carries the closure values the body
1083
+ * used to read from the enclosing scope: fn, config, dir, projectDir,
1084
+ * spawnWorker, milestoneComplete, runParallelImpl, workerTimeoutMs,
1085
+ * heartbeatImpl, heartbeatEnabled, staleHeartbeatMs, heartbeatPollMs, verbose.
1086
+ *
1087
+ * Returns an IterResult per iter-parallel-contract.md v1.0.0 §4. T2 emits a
1088
+ * minimal shape (tasksDone = []) — T4/T5 will populate tasksDone and use
1089
+ * `status` to drive `_computeIterBatchSize`. For now the while-loop driver
1090
+ * consumes only `isTerminal(state.status)`; the returned value is forward-
1091
+ * compatible scaffolding.
1092
+ */
1093
+ async function _runOneIter(state, opts) {
1094
+ const {
1095
+ dir, fn, config, spawnWorker, milestoneComplete, runParallelImpl,
1096
+ workerTimeoutMs, heartbeatImpl, heartbeatEnabled,
1097
+ staleHeartbeatMs, heartbeatPollMs, projectDir,
1098
+ } = opts;
1099
+
1100
+ const _result = (status, extras) => ({
1101
+ iter: state.iter,
1102
+ status,
1103
+ tasksDone: [],
1104
+ verifyNeeded: status === "verify-needed",
1105
+ artifacts: extras || {},
1106
+ });
980
1107
 
1108
+ // ── PRE-WORKER HOOK (contract §12) ─────────────────────────────────────
1109
+ // Refusal → halt with status=failed, lastExit=6 (caps) or 2 (validate).
1110
+ const capIter = fn.checkIterationCap(state, config);
1111
+ if (!capIter.ok) {
1112
+ state.status = "failed";
1113
+ state.lastExit = capIter.code || 6;
1114
+ writeState(state, dir);
1115
+ return _result("failed", { errorMessage: `iteration_cap:${state.lastExit}` });
1116
+ }
1117
+ const capWall = fn.checkWallClockCap(state, config);
1118
+ if (!capWall.ok) {
1119
+ state.status = "failed";
1120
+ state.lastExit = capWall.code || 6;
1121
+ writeState(state, dir);
1122
+ return _result("failed", { errorMessage: `wall_clock_cap:${state.lastExit}` });
1123
+ }
1124
+ const vRes = fn.validateState(state);
1125
+ if (!vRes.ok) {
1126
+ state.status = "failed";
1127
+ state.lastExit = vRes.code || 2;
1128
+ writeState(state, dir);
1129
+ return _result("failed", { errorMessage: `validate_state:${state.lastExit}` });
1130
+ }
1131
+
1132
+ // Pre-spawn bookkeeping
1133
+ state.iter = (state.iter || 0) + 1;
1134
+ const workerStart = new Date();
1135
+ state.lastWorkerStartedAt = workerStart.toISOString();
1136
+ writeState(state, dir);
1137
+
1138
+ _emit(projectDir, {
1139
+ ts: workerStart.toISOString(),
1140
+ iter: state.iter,
1141
+ type: "task_start",
1142
+ source: "supervisor",
1143
+ milestone: state.milestone || "",
1144
+ wave: state.wave || "",
1145
+ task: state.nextTask || "",
1146
+ });
1147
+
1148
+ let res;
1149
+ const workerStartMs = workerStart.getTime();
1150
+ const hbOpts = heartbeatEnabled
1151
+ ? {
1152
+ onHeartbeatCheck: () =>
1153
+ heartbeatImpl({
1154
+ projectDir,
1155
+ workerStartedAt: workerStartMs,
1156
+ staleHeartbeatMs,
1157
+ }),
1158
+ heartbeatPollMs,
1159
+ }
1160
+ : {};
1161
+
1162
+ // M44 D9 (v1.5.0) — planner-driven fan-out decision for this iter.
1163
+ // Ask runParallel whether the current task graph supports ≥2 concurrent
1164
+ // workers. Any failure in the planner path MUST fall back to the single-
1165
+ // worker spawn — the parallel path is purely additive.
1166
+ let iterPlan = null;
1167
+ try {
1168
+ iterPlan = runParallelImpl({
1169
+ projectDir,
1170
+ mode: "unattended",
1171
+ milestone: state.milestone || null,
1172
+ dryRun: true,
1173
+ });
1174
+ } catch (e) {
1175
+ iterPlan = null;
981
1176
  _emit(projectDir, {
982
- ts: workerStart.toISOString(),
983
1177
  iter: state.iter,
984
- type: "task_start",
1178
+ type: "parallelism_reduced",
985
1179
  source: "supervisor",
986
- milestone: state.milestone || "",
987
- wave: state.wave || "",
988
- task: state.nextTask || "",
1180
+ original_count: null,
1181
+ reduced_count: 1,
1182
+ reason: `planner_error:${(e && e.message) || "unknown"}`,
989
1183
  });
1184
+ }
1185
+ const fanOutCount = iterPlan && Number(iterPlan.workerCount) >= 2 ? Number(iterPlan.workerCount) : 1;
1186
+ const parallelTaskIds = iterPlan && Array.isArray(iterPlan.parallelTasks) ? iterPlan.parallelTasks : [];
1187
+ const subsets = fanOutCount >= 2 ? _partitionTasks(parallelTaskIds, fanOutCount) : null;
1188
+ const useFanOut = !!(subsets && subsets.length >= 2);
990
1189
 
991
- let res;
992
- const workerStartMs = workerStart.getTime();
993
- const hbOpts = heartbeatEnabled
994
- ? {
995
- onHeartbeatCheck: () =>
996
- heartbeatImpl({
997
- projectDir,
998
- workerStartedAt: workerStartMs,
999
- staleHeartbeatMs,
1000
- }),
1001
- heartbeatPollMs,
1002
- }
1003
- : {};
1004
- try {
1190
+ try {
1191
+ if (useFanOut) {
1192
+ _emit(projectDir, {
1193
+ ts: workerStart.toISOString(),
1194
+ iter: state.iter,
1195
+ type: "fan_out",
1196
+ source: "supervisor",
1197
+ worker_count: subsets.length,
1198
+ task_ids: parallelTaskIds,
1199
+ });
1200
+ res = await _spawnWorkerFanOut(state, {
1201
+ cwd: projectDir,
1202
+ timeout: workerTimeoutMs,
1203
+ verbose: !!opts.verbose,
1204
+ ...hbOpts,
1205
+ }, spawnWorker, subsets);
1206
+ } else {
1005
1207
  res = spawnWorker(state, {
1006
1208
  cwd: projectDir,
1007
1209
  timeout: workerTimeoutMs,
@@ -1011,207 +1213,294 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
1011
1213
  if (res && typeof res.then === "function") {
1012
1214
  res = await res;
1013
1215
  }
1014
- } catch (e) {
1015
- // Defensive: a real spawnSync shouldn't throw, but a shim could.
1016
- res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
1017
- }
1018
- res = res || { status: null, stdout: "", stderr: "", signal: null };
1019
-
1020
- const workerEnd = new Date();
1021
- const elapsedMs = workerEnd.getTime() - workerStart.getTime();
1022
- const stdout = typeof res.stdout === "string" ? res.stdout : "";
1023
- const stderr = typeof res.stderr === "string" ? res.stderr : "";
1024
-
1025
- // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
1026
- // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
1027
- // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
1028
- // Heartbeat wins on ties because it's the more specific signal.
1029
- let exitCode;
1030
- let lastExitReason = null;
1031
- if (res.staleHeartbeat === true) {
1032
- exitCode = 125;
1033
- lastExitReason = "stale_heartbeat";
1034
- } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
1035
- exitCode = 124;
1036
- lastExitReason = "worker_timeout";
1037
- } else {
1038
- exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
1039
1216
  }
1217
+ } catch (e) {
1218
+ // Defensive: a real spawnSync shouldn't throw, but a shim could.
1219
+ res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
1220
+ }
1221
+ res = res || { status: null, stdout: "", stderr: "", signal: null };
1222
+
1223
+ const workerEnd = new Date();
1224
+ const elapsedMs = workerEnd.getTime() - workerStart.getTime();
1225
+ const stdout = typeof res.stdout === "string" ? res.stdout : "";
1226
+ const stderr = typeof res.stderr === "string" ? res.stderr : "";
1227
+
1228
+ // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
1229
+ // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
1230
+ // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
1231
+ // Heartbeat wins on ties because it's the more specific signal.
1232
+ let exitCode;
1233
+ let lastExitReason = null;
1234
+ if (res.staleHeartbeat === true) {
1235
+ exitCode = 125;
1236
+ lastExitReason = "stale_heartbeat";
1237
+ } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
1238
+ exitCode = 124;
1239
+ lastExitReason = "worker_timeout";
1240
+ } else {
1241
+ exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
1242
+ }
1040
1243
 
1041
- // v3.13.11 Bug 1: when a watchdog fires, make the event explicit in
1042
- // run.log so operators can see WHICH iteration was cut without inferring
1043
- // from exit codes. The marker is prepended to stdout and written in the
1044
- // single per-iter run.log append (no duplicate header).
1045
- let loggedStdout = stdout;
1046
- if (exitCode === 124) {
1047
- const marker =
1048
- `[worker_timeout] iter=${state.iter} budget=${workerTimeoutMs}ms ` +
1049
- `elapsed=${elapsedMs}ms — absolute-backstop SIGTERM delivered, ` +
1050
- `supervisor continues relay per contract §16.\n`;
1051
- loggedStdout = marker + (stdout || "");
1052
- } else if (exitCode === 125) {
1053
- const reason = res.heartbeatReason || "no recent events.jsonl writes";
1054
- const marker =
1055
- `[stale_heartbeat] iter=${state.iter} threshold=${staleHeartbeatMs}ms ` +
1056
- `elapsed=${elapsedMs}ms reason="${reason}" — ` +
1057
- `heartbeat watchdog SIGTERM delivered, supervisor continues relay.\n`;
1058
- loggedStdout = marker + (stdout || "");
1059
- }
1244
+ // v3.13.11 Bug 1: when a watchdog fires, make the event explicit in
1245
+ // run.log so operators can see WHICH iteration was cut without inferring
1246
+ // from exit codes. The marker is prepended to stdout and written in the
1247
+ // single per-iter run.log append (no duplicate header).
1248
+ let loggedStdout = stdout;
1249
+ if (exitCode === 124) {
1250
+ const marker =
1251
+ `[worker_timeout] iter=${state.iter} budget=${workerTimeoutMs}ms ` +
1252
+ `elapsed=${elapsedMs}ms — absolute-backstop SIGTERM delivered, ` +
1253
+ `supervisor continues relay per contract §16.\n`;
1254
+ loggedStdout = marker + (stdout || "");
1255
+ } else if (exitCode === 125) {
1256
+ const reason = res.heartbeatReason || "no recent events.jsonl writes";
1257
+ const marker =
1258
+ `[stale_heartbeat] iter=${state.iter} threshold=${staleHeartbeatMs}ms ` +
1259
+ `elapsed=${elapsedMs}ms reason="${reason}" — ` +
1260
+ `heartbeat watchdog SIGTERM delivered, supervisor continues relay.\n`;
1261
+ loggedStdout = marker + (stdout || "");
1262
+ }
1263
+
1264
+ // Append the full worker output to run.log (never truncate).
1265
+ _appendRunLog(dir, state.iter, workerEnd, exitCode, loggedStdout, stderr);
1266
+
1267
+ // Append to token-log.md (Fix 1, v3.12.12) — supervisor workers write rows
1268
+ // so the log captures headless/unattended activity, not just interactive spawns.
1269
+ _appendTokenLog(projectDir, {
1270
+ dtStart: workerStart.toISOString().slice(0, 16).replace("T", " "),
1271
+ dtEnd: workerEnd.toISOString().slice(0, 16).replace("T", " "),
1272
+ command: "gsd-t-resume",
1273
+ durationS: Math.round(elapsedMs / 1000),
1274
+ exitCode,
1275
+ iter: state.iter,
1276
+ });
1060
1277
 
1061
- // Append the full worker output to run.log (never truncate).
1062
- _appendRunLog(dir, state.iter, workerEnd, exitCode, loggedStdout, stderr);
1063
-
1064
- // Append to token-log.md (Fix 1, v3.12.12) — supervisor workers write rows
1065
- // so the log captures headless/unattended activity, not just interactive spawns.
1066
- _appendTokenLog(projectDir, {
1067
- dtStart: workerStart.toISOString().slice(0, 16).replace("T", " "),
1068
- dtEnd: workerEnd.toISOString().slice(0, 16).replace("T", " "),
1069
- command: "gsd-t-resume",
1070
- durationS: Math.round(elapsedMs / 1000),
1071
- exitCode,
1278
+ // Post-spawn state update
1279
+ state.lastExit = exitCode;
1280
+ state.lastWorkerFinishedAt = workerEnd.toISOString();
1281
+ state.lastElapsedMs = elapsedMs;
1282
+ if (lastExitReason) {
1283
+ state.lastExitReason = lastExitReason;
1284
+ } else if (exitCode === 0) {
1285
+ state.lastExitReason = "clean";
1286
+ } else {
1287
+ state.lastExitReason = `exit_${exitCode}`;
1288
+ }
1289
+ // M44 D9 (v1.5.0) — per-iter multi-worker aggregates. Present only when the
1290
+ // planner selected fan-out; single-worker iters omit these fields so the
1291
+ // state schema stays backward-compatible with v1.4.x readers.
1292
+ if (useFanOut && Array.isArray(res.workerResults)) {
1293
+ state.lastExits = res.workerResults.map((w) => ({
1294
+ idx: w.idx,
1295
+ code: typeof w.status === "number" ? w.status : null,
1296
+ taskIds: w.taskIds || [],
1297
+ elapsedMs: w.elapsedMs,
1298
+ spawnId: w.spawnId || null,
1299
+ }));
1300
+ state.workerPids = res.workerResults.map((w) => w.spawnId || null);
1301
+ state.lastFanOutCount = res.workerResults.length;
1302
+ } else {
1303
+ // Clear stale multi-worker fields on single-worker iters so readers
1304
+ // never see a mix of regimes.
1305
+ if (state.lastExits) delete state.lastExits;
1306
+ if (state.workerPids) delete state.workerPids;
1307
+ if (state.lastFanOutCount) delete state.lastFanOutCount;
1308
+ }
1309
+ writeState(state, dir);
1310
+
1311
+ // Event-stream: task_complete on success, error on non-zero.
1312
+ const durationS = Math.round(elapsedMs / 1000);
1313
+ if (exitCode === 0) {
1314
+ _emit(projectDir, {
1315
+ ts: workerEnd.toISOString(),
1072
1316
  iter: state.iter,
1317
+ type: "task_complete",
1318
+ source: "supervisor",
1319
+ task: state.nextTask || "",
1320
+ verdict: "pass",
1321
+ duration_s: durationS,
1073
1322
  });
1323
+ } else {
1324
+ _emit(projectDir, {
1325
+ ts: workerEnd.toISOString(),
1326
+ iter: state.iter,
1327
+ type: "error",
1328
+ source: "supervisor",
1329
+ error: `worker exit ${exitCode}`,
1330
+ recoverable: exitCode !== 4 && exitCode !== 5,
1331
+ });
1332
+ }
1074
1333
 
1075
- // Post-spawn state update
1076
- state.lastExit = exitCode;
1077
- state.lastWorkerFinishedAt = workerEnd.toISOString();
1078
- state.lastElapsedMs = elapsedMs;
1079
- if (lastExitReason) {
1080
- state.lastExitReason = lastExitReason;
1081
- } else if (exitCode === 0) {
1082
- state.lastExitReason = "clean";
1083
- } else {
1084
- state.lastExitReason = `exit_${exitCode}`;
1334
+ // ── POST-WORKER HOOK (contract §12) ────────────────────────────────────
1335
+ // Read the tail of run.log for pattern detection. ~200 lines is enough
1336
+ // to span the last several iteration blocks for the gutter detector.
1337
+ let runLogTail = "";
1338
+ try {
1339
+ const logPath = path.join(dir, RUN_LOG);
1340
+ if (fs.existsSync(logPath)) {
1341
+ const all = fs.readFileSync(logPath, "utf8");
1342
+ const lines = all.split(/\r?\n/);
1343
+ runLogTail = lines.slice(-200).join("\n");
1085
1344
  }
1345
+ } catch (_) {
1346
+ // best effort — tail read failure does not halt the loop
1347
+ }
1348
+ const blocker = fn.detectBlockerSentinel(runLogTail);
1349
+ if (!blocker.ok) {
1350
+ state.status = "failed";
1351
+ state.lastExit = blocker.code || 6;
1086
1352
  writeState(state, dir);
1353
+ return _result("failed", { errorMessage: `blocker_sentinel:${state.lastExit}` });
1354
+ }
1355
+ const gutter = fn.detectGutter(state, runLogTail, config);
1356
+ if (!gutter.ok) {
1357
+ state.status = "failed";
1358
+ state.lastExit = gutter.code || 6;
1359
+ writeState(state, dir);
1360
+ return _result("failed", { errorMessage: `gutter:${state.lastExit}` });
1361
+ }
1087
1362
 
1088
- // Event-stream: task_complete on success, error on non-zero.
1089
- const durationS = Math.round(elapsedMs / 1000);
1090
- if (exitCode === 0) {
1091
- _emit(projectDir, {
1092
- ts: workerEnd.toISOString(),
1093
- iter: state.iter,
1094
- type: "task_complete",
1095
- source: "supervisor",
1096
- task: state.nextTask || "",
1097
- verdict: "pass",
1098
- duration_s: durationS,
1099
- });
1100
- } else {
1101
- _emit(projectDir, {
1102
- ts: workerEnd.toISOString(),
1103
- iter: state.iter,
1104
- type: "error",
1105
- source: "supervisor",
1106
- error: `worker exit ${exitCode}`,
1107
- recoverable: exitCode !== 4 && exitCode !== 5,
1108
- });
1109
- }
1110
-
1111
- // ── POST-WORKER HOOK (contract §12) ────────────────────────────────────
1112
- // Read the tail of run.log for pattern detection. ~200 lines is enough
1113
- // to span the last several iteration blocks for the gutter detector.
1114
- let runLogTail = "";
1115
- try {
1116
- const logPath = path.join(dir, RUN_LOG);
1117
- if (fs.existsSync(logPath)) {
1118
- const all = fs.readFileSync(logPath, "utf8");
1119
- const lines = all.split(/\r?\n/);
1120
- runLogTail = lines.slice(-200).join("\n");
1121
- }
1122
- } catch (_) {
1123
- // best effort — tail read failure does not halt the loop
1124
- }
1125
- const blocker = fn.detectBlockerSentinel(runLogTail);
1126
- if (!blocker.ok) {
1127
- state.status = "failed";
1128
- state.lastExit = blocker.code || 6;
1363
+ // Terminal exit classification
1364
+ if (exitCode === 0) {
1365
+ // Success — check if the milestone is now complete.
1366
+ if (milestoneComplete(projectDir, state.milestone)) {
1367
+ state.status = "done";
1129
1368
  writeState(state, dir);
1130
- break;
1131
- }
1132
- const gutter = fn.detectGutter(state, runLogTail, config);
1133
- if (!gutter.ok) {
1134
- state.status = "failed";
1135
- state.lastExit = gutter.code || 6;
1136
- writeState(state, dir);
1137
- break;
1138
- }
1139
-
1140
- // Terminal exit classification
1141
- if (exitCode === 0) {
1142
- // Success — check if the milestone is now complete.
1143
- if (milestoneComplete(projectDir, state.milestone)) {
1144
- state.status = "done";
1145
- writeState(state, dir);
1146
- break;
1147
- }
1148
- // Not yet done — continue relay.
1149
- _emit(projectDir, {
1150
- iter: state.iter,
1151
- type: "retry",
1152
- source: "supervisor",
1153
- attempt: state.iter,
1154
- reason: "milestone_incomplete",
1155
- });
1156
- continue;
1369
+ return _result("done");
1157
1370
  }
1158
- if (exitCode === 4) {
1159
- // Unrecoverable blocker.
1160
- state.status = "failed";
1161
- writeState(state, dir);
1162
- break;
1163
- }
1164
- if (exitCode === 5) {
1165
- // Command dispatch failure — worker invocation is broken.
1166
- state.status = "failed";
1167
- writeState(state, dir);
1168
- break;
1169
- }
1170
- if (exitCode === 124) {
1171
- // Timeout — continue unless the iter cap is hit on the next check.
1172
- _emit(projectDir, {
1173
- iter: state.iter,
1174
- type: "retry",
1175
- source: "supervisor",
1176
- attempt: state.iter,
1177
- reason: "timeout",
1178
- });
1179
- continue;
1180
- }
1181
- if (exitCode === 125) {
1182
- // Stale heartbeat (M43) — continue unless the iter cap hits. The
1183
- // heartbeat kill is recoverable by definition: the worker was not
1184
- // emitting events, which is the most common class of stuck iteration
1185
- // (e.g. child stuck on a long Bash call with no tool_call emits).
1186
- _emit(projectDir, {
1187
- iter: state.iter,
1188
- type: "retry",
1189
- source: "supervisor",
1190
- attempt: state.iter,
1191
- reason: "stale_heartbeat",
1192
- });
1193
- continue;
1194
- }
1195
- // Non-terminal (1/2/3) — continue the relay.
1371
+ // Not yet done — continue relay.
1196
1372
  _emit(projectDir, {
1197
1373
  iter: state.iter,
1198
1374
  type: "retry",
1199
1375
  source: "supervisor",
1200
1376
  attempt: state.iter,
1201
- reason: `exit_${exitCode}`,
1377
+ reason: "milestone_incomplete",
1202
1378
  });
1379
+ return _result("running");
1203
1380
  }
1204
-
1205
- // If we exited because the user dropped a stop sentinel and no terminal
1206
- // status has been assigned yet, transition to 'stopped' now (contract §10).
1207
- // The sentinel file itself is NOT removed by the supervisor — it stays on
1208
- // disk as evidence, to be cleaned by the next launch via
1209
- // `cleanStaleStopSentinel`.
1210
- if (!isTerminal(state.status) && stopCheck(projectDir)) {
1211
- state.status = "stopped";
1381
+ if (exitCode === 4) {
1382
+ // Unrecoverable blocker.
1383
+ state.status = "failed";
1212
1384
  writeState(state, dir);
1385
+ return _result("failed", { errorMessage: "exit_4_unrecoverable" });
1213
1386
  }
1214
- return state;
1387
+ if (exitCode === 5) {
1388
+ // Command dispatch failure — worker invocation is broken.
1389
+ state.status = "failed";
1390
+ writeState(state, dir);
1391
+ return _result("failed", { errorMessage: "exit_5_dispatch_failure" });
1392
+ }
1393
+ if (exitCode === 124) {
1394
+ // Timeout — continue unless the iter cap is hit on the next check.
1395
+ _emit(projectDir, {
1396
+ iter: state.iter,
1397
+ type: "retry",
1398
+ source: "supervisor",
1399
+ attempt: state.iter,
1400
+ reason: "timeout",
1401
+ });
1402
+ return _result("running");
1403
+ }
1404
+ if (exitCode === 125) {
1405
+ // Stale heartbeat (M43) — continue unless the iter cap hits. The
1406
+ // heartbeat kill is recoverable by definition: the worker was not
1407
+ // emitting events, which is the most common class of stuck iteration
1408
+ // (e.g. child stuck on a long Bash call with no tool_call emits).
1409
+ _emit(projectDir, {
1410
+ iter: state.iter,
1411
+ type: "retry",
1412
+ source: "supervisor",
1413
+ attempt: state.iter,
1414
+ reason: "stale_heartbeat",
1415
+ });
1416
+ return _result("running");
1417
+ }
1418
+ // Non-terminal (1/2/3) — continue the relay.
1419
+ _emit(projectDir, {
1420
+ iter: state.iter,
1421
+ type: "retry",
1422
+ source: "supervisor",
1423
+ attempt: state.iter,
1424
+ reason: `exit_${exitCode}`,
1425
+ });
1426
+ return _result("running");
1427
+ }
1428
+
1429
+ // ── _computeIterBatchSize (M46 D1 T3) ───────────────────────────────────────
1430
+
1431
+ /**
1432
+ * Decide how many iterations the supervisor main loop should dispatch
1433
+ * concurrently in the next pass. Implements the mode-safety rules from
1434
+ * `.gsd-t/contracts/iter-parallel-contract.md` v1.0.0 §3.1.
1435
+ *
1436
+ * Rules evaluated top-down; first match wins:
1437
+ * 1. status === "verify-needed" → 1 (serial verify gate)
1438
+ * 2. milestoneBoundary === true → 1 (milestone boundary)
1439
+ * 3. status === "complete-milestone" → 1 (single-shot closeout)
1440
+ * 4. otherwise → min(opts.maxIterParallel ?? 4, remainingIters, 8)
1441
+ * where remainingIters = (state.maxIterations ?? Infinity) - (state.iter ?? 0)
1442
+ *
1443
+ * Never returns less than 1.
1444
+ */
1445
+ function _computeIterBatchSize(state, opts) {
1446
+ if (state && state.status === "verify-needed") return 1;
1447
+ if (state && state.milestoneBoundary === true) return 1;
1448
+ if (state && state.status === "complete-milestone") return 1;
1449
+
1450
+ // Production default is 1 (serial, pre-M46 behavior). Iter-parallelism is
1451
+ // opt-in via `opts.maxIterParallel` — callers that pass a number enable it.
1452
+ // Rationale: `_runOneIter` mutates `state.iter` and other shared fields
1453
+ // (heartbeat bookkeeping, writeState) that are not safe to execute on the
1454
+ // same state object concurrently. Unit tests exercise the parallel path
1455
+ // with explicit batch sizes; production main loop omits the flag and runs
1456
+ // strictly serial, preserving the pre-M46 supervisor contract (one iter
1457
+ // counter increment per fan-out pass). See backlog #24 for the follow-up
1458
+ // that makes `_runOneIter` state-clone-safe and lifts this gate.
1459
+ if (!opts || typeof opts.maxIterParallel !== "number") return 1;
1460
+
1461
+ const cap = opts.maxIterParallel;
1462
+ const maxIters = state && typeof state.maxIterations === "number"
1463
+ ? state.maxIterations
1464
+ : Infinity;
1465
+ const currentIter = state && typeof state.iter === "number"
1466
+ ? state.iter
1467
+ : 0;
1468
+ const remainingIters = maxIters - currentIter;
1469
+
1470
+ const size = Math.min(cap, remainingIters, 8);
1471
+ return size < 1 ? 1 : size;
1472
+ }
1473
+
1474
+ // ── _runIterParallel (M46 D1 T4) ────────────────────────────────────────────
1475
+
1476
+ /**
1477
+ * Dispatch `batchSize` independent iter slices concurrently and return an
1478
+ * IterResult[] of exactly that length. Implements the error-isolation rule
1479
+ * from `.gsd-t/contracts/iter-parallel-contract.md` v1.0.0 §4.2: a single
1480
+ * rejected iter is translated into an IterResult with status "error" and
1481
+ * does NOT cancel siblings. The caller decides how to react.
1482
+ *
1483
+ * iterFn defaults to `_runOneIter` for the T7 tests; production callers
1484
+ * (T5 main-loop rewrite) pass the same.
1485
+ */
1486
+ async function _runIterParallel(state, opts, iterFn, batchSize) {
1487
+ const fn = typeof iterFn === "function" ? iterFn : _runOneIter;
1488
+ const n = typeof batchSize === "number" && batchSize >= 1 ? batchSize : 1;
1489
+ const slices = [];
1490
+ for (let i = 0; i < n; i++) slices.push(Promise.resolve().then(() => fn(state, opts)));
1491
+ const settled = await Promise.allSettled(slices);
1492
+ return settled.map((s) => {
1493
+ if (s.status === "fulfilled") return s.value;
1494
+ const reason = s.reason;
1495
+ const msg = (reason && reason.message) ? reason.message : String(reason);
1496
+ return {
1497
+ status: "error",
1498
+ tasksDone: [],
1499
+ verifyNeeded: false,
1500
+ artifacts: [],
1501
+ error: msg,
1502
+ };
1503
+ });
1215
1504
  }
1216
1505
 
1217
1506
  // ── _appendTokenLog (Fix 1, v3.12.12) ───────────────────────────────────────
@@ -1285,11 +1574,24 @@ function _spawnWorker(state, opts) {
1285
1574
  // id as parent, so shims inside the worker write state files that the tree
1286
1575
  // builder can attach under the supervisor root.
1287
1576
  workerEnv.GSD_T_AGENT_ID =
1288
- "supervisor-iter-" + (state && state.iter ? state.iter : Date.now());
1577
+ "supervisor-iter-" + (state && state.iter ? state.iter : Date.now()) +
1578
+ (state && typeof state._workerIndex === "number" ? `-w${state._workerIndex}` : "");
1289
1579
  if (process.env.GSD_T_AGENT_ID) {
1290
1580
  workerEnv.GSD_T_PARENT_AGENT_ID = process.env.GSD_T_AGENT_ID;
1291
1581
  }
1292
1582
 
1583
+ // M44 D9 (v1.5.0) — planner-driven fan-out: when the supervisor partitions
1584
+ // the iter's task graph across N workers, each worker carries its disjoint
1585
+ // task-id subset via env var. The worker prompt consumes this to (a) skip
1586
+ // the intra-worker Team Mode block (the fan-out is the team), (b) restrict
1587
+ // itself to its assigned task IDs.
1588
+ const assignedTaskIds = Array.isArray(opts && opts.taskIds) ? opts.taskIds : null;
1589
+ if (assignedTaskIds && assignedTaskIds.length > 0) {
1590
+ workerEnv.GSD_T_WORKER_TASK_IDS = assignedTaskIds.join(",");
1591
+ workerEnv.GSD_T_WORKER_INDEX = String((state && state._workerIndex) || 0);
1592
+ workerEnv.GSD_T_WORKER_TOTAL = String((state && state._workerTotal) || 1);
1593
+ }
1594
+
1293
1595
  // M42 D1 — allocate a spawn-id + open transcript before spawning. parentId
1294
1596
  // is the supervisor's own spawn-id (set once at supervisor start via
1295
1597
  // GSD_T_SPAWN_ID env) so the sidebar can render parent-indented trees.
@@ -1353,6 +1655,18 @@ function _spawnWorker(state, opts) {
1353
1655
  "",
1354
1656
  "# Team Mode (Intra-Wave Parallelism)",
1355
1657
  "",
1658
+ "M44 D9 (v1.5.0+) — check env `GSD_T_WORKER_TASK_IDS` FIRST. If SET, you",
1659
+ "are one of N planner-assigned workers in a supervisor-level fan-out.",
1660
+ "The value is your disjoint task-id subset. DO NOT spawn Task subagents",
1661
+ "to re-fan-out (the supervisor already did). Execute ONLY your assigned",
1662
+ "task IDs sequentially in this worker, then return. Skip the rest of",
1663
+ "this block.",
1664
+ "",
1665
+ "If GSD_T_WORKER_TASK_IDS is UNSET, the supervisor's planner decided",
1666
+ "N=1 for this iter (sequential fallback: gates vetoed, file-disjointness",
1667
+ "unprovable, or est CW% too high). Proceed with the legacy worker-level",
1668
+ "Team Mode below:",
1669
+ "",
1356
1670
  "Before executing tasks for this iteration, read `.gsd-t/partition.md` to",
1357
1671
  "identify the current wave and which domains belong to it.",
1358
1672
  "",
@@ -1427,6 +1741,97 @@ function _spawnWorker(state, opts) {
1427
1741
  return finalize(spawnResult);
1428
1742
  }
1429
1743
 
1744
// ── _spawnWorkerFanOut (M44 D9, contract v1.5.0) ────────────────────────────

/**
 * Planner-driven multi-worker fan-out. Spawns one concurrent worker per
 * subset via the injected `spawnWorker` shim; each worker sees a disjoint
 * task-id slice through `opts.taskIds` plus `_workerIndex`/`_workerTotal`
 * markers on its state copy. All workers are awaited (a throwing spawn is
 * converted to a status-3 result, never a rejection) before a merged result
 * — shape-compatible with the single-worker path — is returned.
 *
 * Merge semantics:
 *  - `status`         — 0 only if every worker returned 0; otherwise the first
 *                       non-zero status in index order (a missing numeric
 *                       status counts as 1).
 *  - `stdout`         — per-worker blocks joined under `[WORKER i/N tasks=...]`
 *                       headers.
 *  - `stderr`         — concatenated (empty worker stderr is omitted).
 *  - `staleHeartbeat` / `timedOut` — true if any worker triggered them;
 *    `heartbeatReason` is the first one seen.
 *  - `workerResults`  — per-worker {idx, status, taskIds, spawnId, signal,
 *                       elapsedMs, staleHeartbeat, timedOut} for state.json
 *                       aggregation.
 *
 * The caller (runMainLoop) classifies this result exactly like a single
 * worker's; multi-worker observability lives in `workerResults`, not in new
 * control-flow branches.
 */
async function _spawnWorkerFanOut(state, opts, spawnWorker, subsets) {
  const total = subsets.length;

  const settled = await Promise.all(
    subsets.map((taskIds, idx) => {
      const launchedAt = Date.now();
      const workerState = {
        ...state,
        _workerIndex: idx,
        _workerTotal: total,
        _workerTaskIds: taskIds,
      };
      return Promise.resolve()
        .then(() => spawnWorker(workerState, { ...opts, taskIds }))
        .then((result) => ({
          r: result || {},
          taskIds,
          started: launchedAt,
          ended: Date.now(),
          idx,
        }))
        .catch((err) => ({
          r: {
            status: 3,
            stdout: "",
            stderr: String((err && err.message) || err),
            signal: null,
          },
          taskIds,
          started: launchedAt,
          ended: Date.now(),
          idx,
        }));
    })
  );
  // Promise.all preserves input order, but keep the explicit sort so index
  // order is guaranteed regardless of how the shim resolves.
  settled.sort((x, y) => x.idx - y.idx);

  let mergedStatus = 0;
  let anyStale = false;
  let anyTimedOut = false;
  let firstHeartbeatReason = null;
  const outBlocks = [];
  const errBlocks = [];
  const workerResults = [];

  for (const entry of settled) {
    const result = entry.r;
    const exit = typeof result.status === "number" ? result.status : null;
    // Worst-exit-wins, first-seen: only the first non-zero status is recorded.
    if (mergedStatus === 0 && exit !== 0) {
      mergedStatus = exit === null ? 1 : exit;
    }
    anyStale = anyStale || !!result.staleHeartbeat;
    anyTimedOut = anyTimedOut || !!result.timedOut;
    if (!firstHeartbeatReason && result.heartbeatReason) {
      firstHeartbeatReason = result.heartbeatReason;
    }
    const tag = `[WORKER ${entry.idx + 1}/${settled.length} tasks=${(entry.taskIds || []).join(",") || "-"}]`;
    outBlocks.push(`${tag}\n${result.stdout || ""}`);
    if (result.stderr) {
      errBlocks.push(`${tag}\n${result.stderr}`);
    }
    workerResults.push({
      idx: entry.idx,
      status: exit,
      taskIds: entry.taskIds,
      spawnId: result.spawnId || null,
      signal: result.signal || null,
      elapsedMs: entry.ended - entry.started,
      staleHeartbeat: !!result.staleHeartbeat,
      timedOut: !!result.timedOut,
    });
  }

  return {
    status: mergedStatus,
    stdout: outBlocks.join("\n"),
    stderr: errBlocks.join("\n"),
    signal: null,
    timedOut: anyTimedOut,
    staleHeartbeat: anyStale,
    heartbeatReason: firstHeartbeatReason,
    workerResults,
    fanOutCount: settled.length,
  };
}
1822
+
1823
/**
 * Partition a task-id list into `workerCount` roughly-equal subsets via
 * simple round-robin assignment. Each subset is non-empty as long as
 * `tasks.length >= workerCount`.
 *
 * @param {Array} tasks - task ids to distribute; non-arrays and empty arrays
 *   yield `[]`.
 * @param {number} workerCount - desired worker count; floored to an integer.
 *   Values below 1 (including NaN, which previously crashed the round-robin
 *   index with a TypeError) yield `[]`; fractional counts such as 2.5 no
 *   longer over-index the allocated subsets.
 * @returns {Array<Array>} at most `min(floor(workerCount), tasks.length)`
 *   round-robin subsets.
 */
function _partitionTasks(tasks, workerCount) {
  if (!Array.isArray(tasks) || tasks.length === 0) return [];
  // Floor first so fractional counts can't index past the allocated subsets;
  // the negated >= comparison also routes NaN to the empty result.
  const requested = Math.floor(workerCount);
  if (!(requested >= 1)) return [];
  const n = Math.min(requested, tasks.length);
  const subsets = Array.from({ length: n }, () => []);
  for (let i = 0; i < tasks.length; i++) subsets[i % n].push(tasks[i]);
  return subsets;
}
1834
+
1430
1835
  // ── _testModeSpawnWorker ────────────────────────────────────────────────────
1431
1836
 
1432
1837
  /**