@tekyzinc/gsd-t 3.18.13 → 3.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,6 +62,19 @@ function _emit(projectDir, ev) {
62
62
  try { _esAppendEvent(projectDir, ev); } catch (_) { /* never halt the loop */ }
63
63
  }
64
64
 
65
+ // M44 D9 (v1.5.0) — planner-driven multi-worker fan-out. Lazy-loaded so unit
66
+ // tests can stub via deps._runParallel without touching the real module.
67
+ let _parallelModule = null;
68
+ function _loadRunParallel() {
69
+ if (_parallelModule) return _parallelModule;
70
+ try {
71
+ _parallelModule = require("./gsd-t-parallel.cjs");
72
+ } catch {
73
+ _parallelModule = { runParallel: () => ({ workerCount: 0, parallelTasks: [], plan: [] }) };
74
+ }
75
+ return _parallelModule;
76
+ }
77
+
65
78
  // M42 D1 — transcript tee. Captures each worker's stdout lines to an ndjson
66
79
  // file and registers the spawn so the dashboard sidebar can list + render it.
67
80
  // Best-effort: every call is swallowed so tee failures never halt the loop.
@@ -73,7 +86,7 @@ const { checkHeartbeat: _checkHeartbeat } = require("./gsd-t-unattended-heartbea
73
86
 
74
87
  // ── Constants ───────────────────────────────────────────────────────────────
75
88
 
76
- const CONTRACT_VERSION = "1.4.0";
89
+ const CONTRACT_VERSION = "1.5.0";
77
90
  const UNATTENDED_DIR_REL = path.join(".gsd-t", ".unattended");
78
91
  const PID_FILE = "supervisor.pid";
79
92
  const STATE_FILE = "state.json";
@@ -122,6 +135,8 @@ module.exports = {
122
135
  releaseSleepPrevention,
123
136
  runMainLoop,
124
137
  _spawnWorker,
138
+ _spawnWorkerFanOut,
139
+ _partitionTasks,
125
140
  _appendRunLog,
126
141
  CONTRACT_VERSION,
127
142
  UNATTENDED_DIR_REL,
@@ -132,6 +147,50 @@ module.exports = {
132
147
  DEFAULT_HEARTBEAT_POLL_MS,
133
148
  };
134
149
 
150
+ function _reconcile(state, results) {
151
+ if (!Array.isArray(results) || results.length === 0) return;
152
+ for (const r of results) {
153
+ if (!r || typeof r !== 'object') continue;
154
+ // append-only completedTasks (preserve order, dedupe)
155
+ if (Array.isArray(r.tasksDone) && r.tasksDone.length > 0) {
156
+ const current = new Set(state.completedTasks || []);
157
+ for (const t of r.tasksDone) {
158
+ if (!current.has(t)) {
159
+ state.completedTasks = (state.completedTasks || []).concat([t]);
160
+ current.add(t);
161
+ }
162
+ }
163
+ }
164
+ // last-writer-wins on status — but 'error' is sticky: once set, it stays
165
+ // until the next explicit non-error status in a later iter.
166
+ if (r.status && r.status !== state.status) {
167
+ state.status = r.status;
168
+ }
169
+ // verifyNeeded is OR-across-results: any iter that flags it wins.
170
+ if (r.verifyNeeded === true) {
171
+ state.verifyNeeded = true;
172
+ }
173
+ // artifacts: append-only, concat arrays.
174
+ if (Array.isArray(r.artifacts) && r.artifacts.length > 0) {
175
+ state.artifacts = (state.artifacts || []).concat(r.artifacts);
176
+ }
177
+ }
178
+ // NOTE: `state.iter` is advanced by the main while loop (pre-M46 contract:
179
+ // one increment per fan-out pass, regardless of worker/batch count). We do
180
+ // NOT advance it here — doing so would double-increment against the
181
+ // existing supervisor-contract invariant (surfaced by m43/m44 tests).
182
+ state.lastBatch = {
183
+ size: results.length,
184
+ endedAt: new Date().toISOString(),
185
+ errorCount: results.filter(r => r && r.status === 'error').length,
186
+ };
187
+ }
188
+
189
+ // M46 D1 T2 — expose the extracted single-iter body for future unit tests
190
+ // (T7) and the iter-parallel driver (T4/T5). Kept out of the main exports
191
+ // block so consumers don't accidentally import implementation details.
192
+ module.exports.__test__ = { _runOneIter, _computeIterBatchSize, _runIterParallel, _reconcile };
193
+
135
194
  // ── parseArgs ───────────────────────────────────────────────────────────────
136
195
 
137
196
  /**
@@ -927,6 +986,10 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
927
986
  deps._spawnWorker || (useTestStub ? _testModeSpawnWorker : _spawnWorker);
928
987
  const milestoneComplete =
929
988
  deps._isMilestoneComplete || (useTestStub ? () => true : isMilestoneComplete);
989
+ // M44 D9 (v1.5.0) — planner injected for multi-worker iter fan-out.
990
+ // Tests stub via deps._runParallel; production lazy-loads from gsd-t-parallel.cjs.
991
+ const runParallelImpl =
992
+ deps._runParallel || ((o) => _loadRunParallel().runParallel(o));
930
993
  const stopCheck = deps._stopRequested || stopRequested;
931
994
  const workerTimeoutMs = opts.workerTimeoutMs || DEFAULT_WORKER_TIMEOUT_MS;
932
995
  const staleHeartbeatMs =
@@ -947,61 +1010,200 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
947
1010
  const heartbeatEnabled = !deps._disableHeartbeat && !useTestStub;
948
1011
  const projectDir = state.projectDir;
949
1012
 
1013
+ // M46 D1 T2 — pure extract-method refactor. The body of each iteration
1014
+ // now lives in the top-level `_runOneIter` helper (below). The while loop
1015
+ // itself is unchanged in semantics: stop-check and isDone evaluate per
1016
+ // pass, and any terminal state.status ({"done","failed"}) written by the
1017
+ // iter body causes us to break, matching every pre-refactor `break` path.
1018
+ // Non-terminal outcomes fall through to the next iteration, matching the
1019
+ // pre-refactor `continue` paths.
1020
+ const iterCtx = {
1021
+ dir,
1022
+ fn,
1023
+ config,
1024
+ spawnWorker,
1025
+ milestoneComplete,
1026
+ runParallelImpl,
1027
+ workerTimeoutMs,
1028
+ heartbeatImpl,
1029
+ heartbeatEnabled,
1030
+ staleHeartbeatMs,
1031
+ heartbeatPollMs,
1032
+ projectDir,
1033
+ verbose: !!opts.verbose,
1034
+ };
950
1035
  while (!isDone(state) && !stopCheck(projectDir)) {
951
- // ── PRE-WORKER HOOK (contract §12) ─────────────────────────────────────
952
- // Refusal → halt with status=failed, lastExit=6 (caps) or 2 (validate).
953
- const capIter = fn.checkIterationCap(state, config);
954
- if (!capIter.ok) {
955
- state.status = "failed";
956
- state.lastExit = capIter.code || 6;
957
- writeState(state, dir);
958
- break;
959
- }
960
- const capWall = fn.checkWallClockCap(state, config);
961
- if (!capWall.ok) {
962
- state.status = "failed";
963
- state.lastExit = capWall.code || 6;
964
- writeState(state, dir);
965
- break;
966
- }
967
- const vRes = fn.validateState(state);
968
- if (!vRes.ok) {
969
- state.status = "failed";
970
- state.lastExit = vRes.code || 2;
971
- writeState(state, dir);
972
- break;
973
- }
1036
+ const batchSize = _computeIterBatchSize(state, opts);
1037
+ const _batchStartMs = Date.now();
1038
+ try {
1039
+ fs.appendFileSync(
1040
+ path.join(dir, RUN_LOG),
1041
+ `[iter-batch-start] batch-size=${batchSize} iter=${state.iter} ts=${new Date(_batchStartMs).toISOString()}\n`,
1042
+ "utf8"
1043
+ );
1044
+ } catch (_) { /* best effort */ }
1045
+ const results = await _runIterParallel(state, opts, (s, o) => _runOneIter(s, iterCtx), batchSize);
1046
+ _reconcile(state, results);
1047
+ try {
1048
+ const _ok = results.filter((r) => r.status !== "error").length;
1049
+ const _fail = results.length - _ok;
1050
+ const _durSec = ((Date.now() - _batchStartMs) / 1000).toFixed(1);
1051
+ fs.appendFileSync(
1052
+ path.join(dir, RUN_LOG),
1053
+ `[iter-batch-complete] size=${results.length} ok=${_ok} fail=${_fail} duration=${_durSec}s iter=${state.iter}\n`,
1054
+ "utf8"
1055
+ );
1056
+ } catch (_) { /* best effort */ }
1057
+ if (isTerminal(state.status)) break;
1058
+ }
974
1059
 
975
- // Pre-spawn bookkeeping
976
- state.iter = (state.iter || 0) + 1;
977
- const workerStart = new Date();
978
- state.lastWorkerStartedAt = workerStart.toISOString();
1060
+ // If we exited because the user dropped a stop sentinel and no terminal
1061
+ // status has been assigned yet, transition to 'stopped' now (contract §10).
1062
+ // The sentinel file itself is NOT removed by the supervisor — it stays on
1063
+ // disk as evidence, to be cleaned by the next launch via
1064
+ // `cleanStaleStopSentinel`.
1065
+ if (!isTerminal(state.status) && stopCheck(projectDir)) {
1066
+ state.status = "stopped";
979
1067
  writeState(state, dir);
1068
+ }
1069
+ return state;
1070
+ }
1071
+
1072
+ // ── _runOneIter (M46 D1 T2) ─────────────────────────────────────────────────
1073
+
1074
+ /**
1075
+ * Body of a single supervisor iteration, extracted verbatim from the
1076
+ * `runMainLoop` while-loop (pre-M46-D1). Mutates `state` in place exactly as
1077
+ * the original body did — all writeState calls, event-stream emits, run.log
1078
+ * and token-log appends, heartbeat wiring, fan-out dispatch, and exit-code
1079
+ * classification are preserved line-for-line.
1080
+ *
1081
+ * `opts` here is the per-iter context bundle assembled in runMainLoop (not
1082
+ * the supervisor-level opts object). It carries the closure values the body
1083
+ * used to read from the enclosing scope: fn, config, dir, projectDir,
1084
+ * spawnWorker, milestoneComplete, runParallelImpl, workerTimeoutMs,
1085
+ * heartbeatImpl, heartbeatEnabled, staleHeartbeatMs, heartbeatPollMs, verbose.
1086
+ *
1087
+ * Returns an IterResult per iter-parallel-contract.md v1.0.0 §4. T2 emits a
1088
+ * minimal shape (tasksDone = []) — T4/T5 will populate tasksDone and use
1089
+ * `status` to drive `_computeIterBatchSize`. For now the while-loop driver
1090
+ * consumes only `isTerminal(state.status)`; the returned value is forward-
1091
+ * compatible scaffolding.
1092
+ */
1093
+ async function _runOneIter(state, opts) {
1094
+ const {
1095
+ dir, fn, config, spawnWorker, milestoneComplete, runParallelImpl,
1096
+ workerTimeoutMs, heartbeatImpl, heartbeatEnabled,
1097
+ staleHeartbeatMs, heartbeatPollMs, projectDir,
1098
+ } = opts;
1099
+
1100
+ const _result = (status, extras) => ({
1101
+ iter: state.iter,
1102
+ status,
1103
+ tasksDone: [],
1104
+ verifyNeeded: status === "verify-needed",
1105
+ artifacts: extras || {},
1106
+ });
980
1107
 
1108
+ // ── PRE-WORKER HOOK (contract §12) ─────────────────────────────────────
1109
+ // Refusal → halt with status=failed, lastExit=6 (caps) or 2 (validate).
1110
+ const capIter = fn.checkIterationCap(state, config);
1111
+ if (!capIter.ok) {
1112
+ state.status = "failed";
1113
+ state.lastExit = capIter.code || 6;
1114
+ writeState(state, dir);
1115
+ return _result("failed", { errorMessage: `iteration_cap:${state.lastExit}` });
1116
+ }
1117
+ const capWall = fn.checkWallClockCap(state, config);
1118
+ if (!capWall.ok) {
1119
+ state.status = "failed";
1120
+ state.lastExit = capWall.code || 6;
1121
+ writeState(state, dir);
1122
+ return _result("failed", { errorMessage: `wall_clock_cap:${state.lastExit}` });
1123
+ }
1124
+ const vRes = fn.validateState(state);
1125
+ if (!vRes.ok) {
1126
+ state.status = "failed";
1127
+ state.lastExit = vRes.code || 2;
1128
+ writeState(state, dir);
1129
+ return _result("failed", { errorMessage: `validate_state:${state.lastExit}` });
1130
+ }
1131
+
1132
+ // Pre-spawn bookkeeping
1133
+ state.iter = (state.iter || 0) + 1;
1134
+ const workerStart = new Date();
1135
+ state.lastWorkerStartedAt = workerStart.toISOString();
1136
+ writeState(state, dir);
1137
+
1138
+ _emit(projectDir, {
1139
+ ts: workerStart.toISOString(),
1140
+ iter: state.iter,
1141
+ type: "task_start",
1142
+ source: "supervisor",
1143
+ milestone: state.milestone || "",
1144
+ wave: state.wave || "",
1145
+ task: state.nextTask || "",
1146
+ });
1147
+
1148
+ let res;
1149
+ const workerStartMs = workerStart.getTime();
1150
+ const hbOpts = heartbeatEnabled
1151
+ ? {
1152
+ onHeartbeatCheck: () =>
1153
+ heartbeatImpl({
1154
+ projectDir,
1155
+ workerStartedAt: workerStartMs,
1156
+ staleHeartbeatMs,
1157
+ }),
1158
+ heartbeatPollMs,
1159
+ }
1160
+ : {};
1161
+
1162
+ // M44 D9 (v1.5.0) — planner-driven fan-out decision for this iter.
1163
+ // Ask runParallel whether the current task graph supports ≥2 concurrent
1164
+ // workers. Any failure in the planner path MUST fall back to the single-
1165
+ // worker spawn — the parallel path is purely additive.
1166
+ let iterPlan = null;
1167
+ try {
1168
+ iterPlan = runParallelImpl({
1169
+ projectDir,
1170
+ mode: "unattended",
1171
+ milestone: state.milestone || null,
1172
+ dryRun: true,
1173
+ });
1174
+ } catch (e) {
1175
+ iterPlan = null;
981
1176
  _emit(projectDir, {
982
- ts: workerStart.toISOString(),
983
1177
  iter: state.iter,
984
- type: "task_start",
1178
+ type: "parallelism_reduced",
985
1179
  source: "supervisor",
986
- milestone: state.milestone || "",
987
- wave: state.wave || "",
988
- task: state.nextTask || "",
1180
+ original_count: null,
1181
+ reduced_count: 1,
1182
+ reason: `planner_error:${(e && e.message) || "unknown"}`,
989
1183
  });
1184
+ }
1185
+ const fanOutCount = iterPlan && Number(iterPlan.workerCount) >= 2 ? Number(iterPlan.workerCount) : 1;
1186
+ const parallelTaskIds = iterPlan && Array.isArray(iterPlan.parallelTasks) ? iterPlan.parallelTasks : [];
1187
+ const subsets = fanOutCount >= 2 ? _partitionTasks(parallelTaskIds, fanOutCount) : null;
1188
+ const useFanOut = !!(subsets && subsets.length >= 2);
990
1189
 
991
- let res;
992
- const workerStartMs = workerStart.getTime();
993
- const hbOpts = heartbeatEnabled
994
- ? {
995
- onHeartbeatCheck: () =>
996
- heartbeatImpl({
997
- projectDir,
998
- workerStartedAt: workerStartMs,
999
- staleHeartbeatMs,
1000
- }),
1001
- heartbeatPollMs,
1002
- }
1003
- : {};
1004
- try {
1190
+ try {
1191
+ if (useFanOut) {
1192
+ _emit(projectDir, {
1193
+ ts: workerStart.toISOString(),
1194
+ iter: state.iter,
1195
+ type: "fan_out",
1196
+ source: "supervisor",
1197
+ worker_count: subsets.length,
1198
+ task_ids: parallelTaskIds,
1199
+ });
1200
+ res = await _spawnWorkerFanOut(state, {
1201
+ cwd: projectDir,
1202
+ timeout: workerTimeoutMs,
1203
+ verbose: !!opts.verbose,
1204
+ ...hbOpts,
1205
+ }, spawnWorker, subsets);
1206
+ } else {
1005
1207
  res = spawnWorker(state, {
1006
1208
  cwd: projectDir,
1007
1209
  timeout: workerTimeoutMs,
@@ -1011,207 +1213,294 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
1011
1213
  if (res && typeof res.then === "function") {
1012
1214
  res = await res;
1013
1215
  }
1014
- } catch (e) {
1015
- // Defensive: a real spawnSync shouldn't throw, but a shim could.
1016
- res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
1017
- }
1018
- res = res || { status: null, stdout: "", stderr: "", signal: null };
1019
-
1020
- const workerEnd = new Date();
1021
- const elapsedMs = workerEnd.getTime() - workerStart.getTime();
1022
- const stdout = typeof res.stdout === "string" ? res.stdout : "";
1023
- const stderr = typeof res.stderr === "string" ? res.stderr : "";
1024
-
1025
- // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
1026
- // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
1027
- // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
1028
- // Heartbeat wins on ties because it's the more specific signal.
1029
- let exitCode;
1030
- let lastExitReason = null;
1031
- if (res.staleHeartbeat === true) {
1032
- exitCode = 125;
1033
- lastExitReason = "stale_heartbeat";
1034
- } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
1035
- exitCode = 124;
1036
- lastExitReason = "worker_timeout";
1037
- } else {
1038
- exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
1039
1216
  }
1217
+ } catch (e) {
1218
+ // Defensive: a real spawnSync shouldn't throw, but a shim could.
1219
+ res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
1220
+ }
1221
+ res = res || { status: null, stdout: "", stderr: "", signal: null };
1222
+
1223
+ const workerEnd = new Date();
1224
+ const elapsedMs = workerEnd.getTime() - workerStart.getTime();
1225
+ const stdout = typeof res.stdout === "string" ? res.stdout : "";
1226
+ const stderr = typeof res.stderr === "string" ? res.stderr : "";
1227
+
1228
+ // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
1229
+ // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
1230
+ // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
1231
+ // Heartbeat wins on ties because it's the more specific signal.
1232
+ let exitCode;
1233
+ let lastExitReason = null;
1234
+ if (res.staleHeartbeat === true) {
1235
+ exitCode = 125;
1236
+ lastExitReason = "stale_heartbeat";
1237
+ } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
1238
+ exitCode = 124;
1239
+ lastExitReason = "worker_timeout";
1240
+ } else {
1241
+ exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
1242
+ }
1040
1243
 
1041
- // v3.13.11 Bug 1: when a watchdog fires, make the event explicit in
1042
- // run.log so operators can see WHICH iteration was cut without inferring
1043
- // from exit codes. The marker is prepended to stdout and written in the
1044
- // single per-iter run.log append (no duplicate header).
1045
- let loggedStdout = stdout;
1046
- if (exitCode === 124) {
1047
- const marker =
1048
- `[worker_timeout] iter=${state.iter} budget=${workerTimeoutMs}ms ` +
1049
- `elapsed=${elapsedMs}ms — absolute-backstop SIGTERM delivered, ` +
1050
- `supervisor continues relay per contract §16.\n`;
1051
- loggedStdout = marker + (stdout || "");
1052
- } else if (exitCode === 125) {
1053
- const reason = res.heartbeatReason || "no recent events.jsonl writes";
1054
- const marker =
1055
- `[stale_heartbeat] iter=${state.iter} threshold=${staleHeartbeatMs}ms ` +
1056
- `elapsed=${elapsedMs}ms reason="${reason}" — ` +
1057
- `heartbeat watchdog SIGTERM delivered, supervisor continues relay.\n`;
1058
- loggedStdout = marker + (stdout || "");
1059
- }
1244
+ // v3.13.11 Bug 1: when a watchdog fires, make the event explicit in
1245
+ // run.log so operators can see WHICH iteration was cut without inferring
1246
+ // from exit codes. The marker is prepended to stdout and written in the
1247
+ // single per-iter run.log append (no duplicate header).
1248
+ let loggedStdout = stdout;
1249
+ if (exitCode === 124) {
1250
+ const marker =
1251
+ `[worker_timeout] iter=${state.iter} budget=${workerTimeoutMs}ms ` +
1252
+ `elapsed=${elapsedMs}ms — absolute-backstop SIGTERM delivered, ` +
1253
+ `supervisor continues relay per contract §16.\n`;
1254
+ loggedStdout = marker + (stdout || "");
1255
+ } else if (exitCode === 125) {
1256
+ const reason = res.heartbeatReason || "no recent events.jsonl writes";
1257
+ const marker =
1258
+ `[stale_heartbeat] iter=${state.iter} threshold=${staleHeartbeatMs}ms ` +
1259
+ `elapsed=${elapsedMs}ms reason="${reason}" — ` +
1260
+ `heartbeat watchdog SIGTERM delivered, supervisor continues relay.\n`;
1261
+ loggedStdout = marker + (stdout || "");
1262
+ }
1263
+
1264
+ // Append the full worker output to run.log (never truncate).
1265
+ _appendRunLog(dir, state.iter, workerEnd, exitCode, loggedStdout, stderr);
1266
+
1267
+ // Append to token-log.md (Fix 1, v3.12.12) — supervisor workers write rows
1268
+ // so the log captures headless/unattended activity, not just interactive spawns.
1269
+ _appendTokenLog(projectDir, {
1270
+ dtStart: workerStart.toISOString().slice(0, 16).replace("T", " "),
1271
+ dtEnd: workerEnd.toISOString().slice(0, 16).replace("T", " "),
1272
+ command: "gsd-t-resume",
1273
+ durationS: Math.round(elapsedMs / 1000),
1274
+ exitCode,
1275
+ iter: state.iter,
1276
+ });
1060
1277
 
1061
- // Append the full worker output to run.log (never truncate).
1062
- _appendRunLog(dir, state.iter, workerEnd, exitCode, loggedStdout, stderr);
1063
-
1064
- // Append to token-log.md (Fix 1, v3.12.12) — supervisor workers write rows
1065
- // so the log captures headless/unattended activity, not just interactive spawns.
1066
- _appendTokenLog(projectDir, {
1067
- dtStart: workerStart.toISOString().slice(0, 16).replace("T", " "),
1068
- dtEnd: workerEnd.toISOString().slice(0, 16).replace("T", " "),
1069
- command: "gsd-t-resume",
1070
- durationS: Math.round(elapsedMs / 1000),
1071
- exitCode,
1278
+ // Post-spawn state update
1279
+ state.lastExit = exitCode;
1280
+ state.lastWorkerFinishedAt = workerEnd.toISOString();
1281
+ state.lastElapsedMs = elapsedMs;
1282
+ if (lastExitReason) {
1283
+ state.lastExitReason = lastExitReason;
1284
+ } else if (exitCode === 0) {
1285
+ state.lastExitReason = "clean";
1286
+ } else {
1287
+ state.lastExitReason = `exit_${exitCode}`;
1288
+ }
1289
+ // M44 D9 (v1.5.0) — per-iter multi-worker aggregates. Present only when the
1290
+ // planner selected fan-out; single-worker iters omit these fields so the
1291
+ // state schema stays backward-compatible with v1.4.x readers.
1292
+ if (useFanOut && Array.isArray(res.workerResults)) {
1293
+ state.lastExits = res.workerResults.map((w) => ({
1294
+ idx: w.idx,
1295
+ code: typeof w.status === "number" ? w.status : null,
1296
+ taskIds: w.taskIds || [],
1297
+ elapsedMs: w.elapsedMs,
1298
+ spawnId: w.spawnId || null,
1299
+ }));
1300
+ state.workerPids = res.workerResults.map((w) => w.spawnId || null);
1301
+ state.lastFanOutCount = res.workerResults.length;
1302
+ } else {
1303
+ // Clear stale multi-worker fields on single-worker iters so readers
1304
+ // never see a mix of regimes.
1305
+ if (state.lastExits) delete state.lastExits;
1306
+ if (state.workerPids) delete state.workerPids;
1307
+ if (state.lastFanOutCount) delete state.lastFanOutCount;
1308
+ }
1309
+ writeState(state, dir);
1310
+
1311
+ // Event-stream: task_complete on success, error on non-zero.
1312
+ const durationS = Math.round(elapsedMs / 1000);
1313
+ if (exitCode === 0) {
1314
+ _emit(projectDir, {
1315
+ ts: workerEnd.toISOString(),
1072
1316
  iter: state.iter,
1317
+ type: "task_complete",
1318
+ source: "supervisor",
1319
+ task: state.nextTask || "",
1320
+ verdict: "pass",
1321
+ duration_s: durationS,
1073
1322
  });
1323
+ } else {
1324
+ _emit(projectDir, {
1325
+ ts: workerEnd.toISOString(),
1326
+ iter: state.iter,
1327
+ type: "error",
1328
+ source: "supervisor",
1329
+ error: `worker exit ${exitCode}`,
1330
+ recoverable: exitCode !== 4 && exitCode !== 5,
1331
+ });
1332
+ }
1074
1333
 
1075
- // Post-spawn state update
1076
- state.lastExit = exitCode;
1077
- state.lastWorkerFinishedAt = workerEnd.toISOString();
1078
- state.lastElapsedMs = elapsedMs;
1079
- if (lastExitReason) {
1080
- state.lastExitReason = lastExitReason;
1081
- } else if (exitCode === 0) {
1082
- state.lastExitReason = "clean";
1083
- } else {
1084
- state.lastExitReason = `exit_${exitCode}`;
1334
+ // ── POST-WORKER HOOK (contract §12) ────────────────────────────────────
1335
+ // Read the tail of run.log for pattern detection. ~200 lines is enough
1336
+ // to span the last several iteration blocks for the gutter detector.
1337
+ let runLogTail = "";
1338
+ try {
1339
+ const logPath = path.join(dir, RUN_LOG);
1340
+ if (fs.existsSync(logPath)) {
1341
+ const all = fs.readFileSync(logPath, "utf8");
1342
+ const lines = all.split(/\r?\n/);
1343
+ runLogTail = lines.slice(-200).join("\n");
1085
1344
  }
1345
+ } catch (_) {
1346
+ // best effort — tail read failure does not halt the loop
1347
+ }
1348
+ const blocker = fn.detectBlockerSentinel(runLogTail);
1349
+ if (!blocker.ok) {
1350
+ state.status = "failed";
1351
+ state.lastExit = blocker.code || 6;
1086
1352
  writeState(state, dir);
1353
+ return _result("failed", { errorMessage: `blocker_sentinel:${state.lastExit}` });
1354
+ }
1355
+ const gutter = fn.detectGutter(state, runLogTail, config);
1356
+ if (!gutter.ok) {
1357
+ state.status = "failed";
1358
+ state.lastExit = gutter.code || 6;
1359
+ writeState(state, dir);
1360
+ return _result("failed", { errorMessage: `gutter:${state.lastExit}` });
1361
+ }
1087
1362
 
1088
- // Event-stream: task_complete on success, error on non-zero.
1089
- const durationS = Math.round(elapsedMs / 1000);
1090
- if (exitCode === 0) {
1091
- _emit(projectDir, {
1092
- ts: workerEnd.toISOString(),
1093
- iter: state.iter,
1094
- type: "task_complete",
1095
- source: "supervisor",
1096
- task: state.nextTask || "",
1097
- verdict: "pass",
1098
- duration_s: durationS,
1099
- });
1100
- } else {
1101
- _emit(projectDir, {
1102
- ts: workerEnd.toISOString(),
1103
- iter: state.iter,
1104
- type: "error",
1105
- source: "supervisor",
1106
- error: `worker exit ${exitCode}`,
1107
- recoverable: exitCode !== 4 && exitCode !== 5,
1108
- });
1109
- }
1110
-
1111
- // ── POST-WORKER HOOK (contract §12) ────────────────────────────────────
1112
- // Read the tail of run.log for pattern detection. ~200 lines is enough
1113
- // to span the last several iteration blocks for the gutter detector.
1114
- let runLogTail = "";
1115
- try {
1116
- const logPath = path.join(dir, RUN_LOG);
1117
- if (fs.existsSync(logPath)) {
1118
- const all = fs.readFileSync(logPath, "utf8");
1119
- const lines = all.split(/\r?\n/);
1120
- runLogTail = lines.slice(-200).join("\n");
1121
- }
1122
- } catch (_) {
1123
- // best effort — tail read failure does not halt the loop
1124
- }
1125
- const blocker = fn.detectBlockerSentinel(runLogTail);
1126
- if (!blocker.ok) {
1127
- state.status = "failed";
1128
- state.lastExit = blocker.code || 6;
1363
+ // Terminal exit classification
1364
+ if (exitCode === 0) {
1365
+ // Success — check if the milestone is now complete.
1366
+ if (milestoneComplete(projectDir, state.milestone)) {
1367
+ state.status = "done";
1129
1368
  writeState(state, dir);
1130
- break;
1131
- }
1132
- const gutter = fn.detectGutter(state, runLogTail, config);
1133
- if (!gutter.ok) {
1134
- state.status = "failed";
1135
- state.lastExit = gutter.code || 6;
1136
- writeState(state, dir);
1137
- break;
1138
- }
1139
-
1140
- // Terminal exit classification
1141
- if (exitCode === 0) {
1142
- // Success — check if the milestone is now complete.
1143
- if (milestoneComplete(projectDir, state.milestone)) {
1144
- state.status = "done";
1145
- writeState(state, dir);
1146
- break;
1147
- }
1148
- // Not yet done — continue relay.
1149
- _emit(projectDir, {
1150
- iter: state.iter,
1151
- type: "retry",
1152
- source: "supervisor",
1153
- attempt: state.iter,
1154
- reason: "milestone_incomplete",
1155
- });
1156
- continue;
1369
+ return _result("done");
1157
1370
  }
1158
- if (exitCode === 4) {
1159
- // Unrecoverable blocker.
1160
- state.status = "failed";
1161
- writeState(state, dir);
1162
- break;
1163
- }
1164
- if (exitCode === 5) {
1165
- // Command dispatch failure — worker invocation is broken.
1166
- state.status = "failed";
1167
- writeState(state, dir);
1168
- break;
1169
- }
1170
- if (exitCode === 124) {
1171
- // Timeout — continue unless the iter cap is hit on the next check.
1172
- _emit(projectDir, {
1173
- iter: state.iter,
1174
- type: "retry",
1175
- source: "supervisor",
1176
- attempt: state.iter,
1177
- reason: "timeout",
1178
- });
1179
- continue;
1180
- }
1181
- if (exitCode === 125) {
1182
- // Stale heartbeat (M43) — continue unless the iter cap hits. The
1183
- // heartbeat kill is recoverable by definition: the worker was not
1184
- // emitting events, which is the most common class of stuck iteration
1185
- // (e.g. child stuck on a long Bash call with no tool_call emits).
1186
- _emit(projectDir, {
1187
- iter: state.iter,
1188
- type: "retry",
1189
- source: "supervisor",
1190
- attempt: state.iter,
1191
- reason: "stale_heartbeat",
1192
- });
1193
- continue;
1194
- }
1195
- // Non-terminal (1/2/3) — continue the relay.
1371
+ // Not yet done — continue relay.
1196
1372
  _emit(projectDir, {
1197
1373
  iter: state.iter,
1198
1374
  type: "retry",
1199
1375
  source: "supervisor",
1200
1376
  attempt: state.iter,
1201
- reason: `exit_${exitCode}`,
1377
+ reason: "milestone_incomplete",
1202
1378
  });
1379
+ return _result("running");
1203
1380
  }
1204
-
1205
- // If we exited because the user dropped a stop sentinel and no terminal
1206
- // status has been assigned yet, transition to 'stopped' now (contract §10).
1207
- // The sentinel file itself is NOT removed by the supervisor — it stays on
1208
- // disk as evidence, to be cleaned by the next launch via
1209
- // `cleanStaleStopSentinel`.
1210
- if (!isTerminal(state.status) && stopCheck(projectDir)) {
1211
- state.status = "stopped";
1381
+ if (exitCode === 4) {
1382
+ // Unrecoverable blocker.
1383
+ state.status = "failed";
1212
1384
  writeState(state, dir);
1385
+ return _result("failed", { errorMessage: "exit_4_unrecoverable" });
1213
1386
  }
1214
- return state;
1387
+ if (exitCode === 5) {
1388
+ // Command dispatch failure — worker invocation is broken.
1389
+ state.status = "failed";
1390
+ writeState(state, dir);
1391
+ return _result("failed", { errorMessage: "exit_5_dispatch_failure" });
1392
+ }
1393
+ if (exitCode === 124) {
1394
+ // Timeout — continue unless the iter cap is hit on the next check.
1395
+ _emit(projectDir, {
1396
+ iter: state.iter,
1397
+ type: "retry",
1398
+ source: "supervisor",
1399
+ attempt: state.iter,
1400
+ reason: "timeout",
1401
+ });
1402
+ return _result("running");
1403
+ }
1404
+ if (exitCode === 125) {
1405
+ // Stale heartbeat (M43) — continue unless the iter cap hits. The
1406
+ // heartbeat kill is recoverable by definition: the worker was not
1407
+ // emitting events, which is the most common class of stuck iteration
1408
+ // (e.g. child stuck on a long Bash call with no tool_call emits).
1409
+ _emit(projectDir, {
1410
+ iter: state.iter,
1411
+ type: "retry",
1412
+ source: "supervisor",
1413
+ attempt: state.iter,
1414
+ reason: "stale_heartbeat",
1415
+ });
1416
+ return _result("running");
1417
+ }
1418
+ // Non-terminal (1/2/3) — continue the relay.
1419
+ _emit(projectDir, {
1420
+ iter: state.iter,
1421
+ type: "retry",
1422
+ source: "supervisor",
1423
+ attempt: state.iter,
1424
+ reason: `exit_${exitCode}`,
1425
+ });
1426
+ return _result("running");
1427
+ }
1428
+
1429
+ // ── _computeIterBatchSize (M46 D1 T3) ───────────────────────────────────────
1430
+
1431
+ /**
1432
+ * Decide how many iterations the supervisor main loop should dispatch
1433
+ * concurrently in the next pass. Implements the mode-safety rules from
1434
+ * `.gsd-t/contracts/iter-parallel-contract.md` v1.0.0 §3.1.
1435
+ *
1436
+ * Rules evaluated top-down; first match wins:
1437
+ * 1. status === "verify-needed" → 1 (serial verify gate)
1438
+ * 2. milestoneBoundary === true → 1 (milestone boundary)
1439
+ * 3. status === "complete-milestone" → 1 (single-shot closeout)
1440
+ * 4. otherwise → min(opts.maxIterParallel ?? 4, remainingIters, 8)
1441
+ * where remainingIters = (state.maxIterations ?? Infinity) - (state.iter ?? 0)
1442
+ *
1443
+ * Never returns less than 1.
1444
+ */
1445
+ function _computeIterBatchSize(state, opts) {
1446
+ if (state && state.status === "verify-needed") return 1;
1447
+ if (state && state.milestoneBoundary === true) return 1;
1448
+ if (state && state.status === "complete-milestone") return 1;
1449
+
1450
+ // Production default is 1 (serial, pre-M46 behavior). Iter-parallelism is
1451
+ // opt-in via `opts.maxIterParallel` — callers that pass a number enable it.
1452
+ // Rationale: `_runOneIter` mutates `state.iter` and other shared fields
1453
+ // (heartbeat bookkeeping, writeState) that are not safe to execute on the
1454
+ // same state object concurrently. Unit tests exercise the parallel path
1455
+ // with explicit batch sizes; production main loop omits the flag and runs
1456
+ // strictly serial, preserving the pre-M46 supervisor contract (one iter
1457
+ // counter increment per fan-out pass). See backlog #24 for the follow-up
1458
+ // that makes `_runOneIter` state-clone-safe and lifts this gate.
1459
+ if (!opts || typeof opts.maxIterParallel !== "number") return 1;
1460
+
1461
+ const cap = opts.maxIterParallel;
1462
+ const maxIters = state && typeof state.maxIterations === "number"
1463
+ ? state.maxIterations
1464
+ : Infinity;
1465
+ const currentIter = state && typeof state.iter === "number"
1466
+ ? state.iter
1467
+ : 0;
1468
+ const remainingIters = maxIters - currentIter;
1469
+
1470
+ const size = Math.min(cap, remainingIters, 8);
1471
+ return size < 1 ? 1 : size;
1472
+ }
1473
+
1474
+ // ── _runIterParallel (M46 D1 T4) ────────────────────────────────────────────
1475
+
1476
+ /**
1477
+ * Dispatch `batchSize` independent iter slices concurrently and return an
1478
+ * IterResult[] of exactly that length. Implements the error-isolation rule
1479
+ * from `.gsd-t/contracts/iter-parallel-contract.md` v1.0.0 §4.2: a single
1480
+ * rejected iter is translated into an IterResult with status "error" and
1481
+ * does NOT cancel siblings. The caller decides how to react.
1482
+ *
1483
+ * iterFn defaults to `_runOneIter` for the T7 tests; production callers
1484
+ * (T5 main-loop rewrite) pass the same.
1485
+ */
1486
+ async function _runIterParallel(state, opts, iterFn, batchSize) {
1487
+ const fn = typeof iterFn === "function" ? iterFn : _runOneIter;
1488
+ const n = typeof batchSize === "number" && batchSize >= 1 ? batchSize : 1;
1489
+ const slices = [];
1490
+ for (let i = 0; i < n; i++) slices.push(Promise.resolve().then(() => fn(state, opts)));
1491
+ const settled = await Promise.allSettled(slices);
1492
+ return settled.map((s) => {
1493
+ if (s.status === "fulfilled") return s.value;
1494
+ const reason = s.reason;
1495
+ const msg = (reason && reason.message) ? reason.message : String(reason);
1496
+ return {
1497
+ status: "error",
1498
+ tasksDone: [],
1499
+ verifyNeeded: false,
1500
+ artifacts: [],
1501
+ error: msg,
1502
+ };
1503
+ });
1215
1504
  }
1216
1505
 
1217
1506
  // ── _appendTokenLog (Fix 1, v3.12.12) ───────────────────────────────────────
@@ -1285,11 +1574,24 @@ function _spawnWorker(state, opts) {
1285
1574
  // id as parent, so shims inside the worker write state files that the tree
1286
1575
  // builder can attach under the supervisor root.
1287
1576
  workerEnv.GSD_T_AGENT_ID =
1288
- "supervisor-iter-" + (state && state.iter ? state.iter : Date.now());
1577
+ "supervisor-iter-" + (state && state.iter ? state.iter : Date.now()) +
1578
+ (state && typeof state._workerIndex === "number" ? `-w${state._workerIndex}` : "");
1289
1579
  if (process.env.GSD_T_AGENT_ID) {
1290
1580
  workerEnv.GSD_T_PARENT_AGENT_ID = process.env.GSD_T_AGENT_ID;
1291
1581
  }
1292
1582
 
1583
+ // M44 D9 (v1.5.0) — planner-driven fan-out: when the supervisor partitions
1584
+ // the iter's task graph across N workers, each worker carries its disjoint
1585
+ // task-id subset via env var. The worker prompt consumes this to (a) skip
1586
+ // the intra-worker Team Mode block (the fan-out is the team), (b) restrict
1587
+ // itself to its assigned task IDs.
1588
+ const assignedTaskIds = Array.isArray(opts && opts.taskIds) ? opts.taskIds : null;
1589
+ if (assignedTaskIds && assignedTaskIds.length > 0) {
1590
+ workerEnv.GSD_T_WORKER_TASK_IDS = assignedTaskIds.join(",");
1591
+ workerEnv.GSD_T_WORKER_INDEX = String((state && state._workerIndex) || 0);
1592
+ workerEnv.GSD_T_WORKER_TOTAL = String((state && state._workerTotal) || 1);
1593
+ }
1594
+
1293
1595
  // M42 D1 — allocate a spawn-id + open transcript before spawning. parentId
1294
1596
  // is the supervisor's own spawn-id (set once at supervisor start via
1295
1597
  // GSD_T_SPAWN_ID env) so the sidebar can render parent-indented trees.
@@ -1353,6 +1655,18 @@ function _spawnWorker(state, opts) {
1353
1655
  "",
1354
1656
  "# Team Mode (Intra-Wave Parallelism)",
1355
1657
  "",
1658
+ "M44 D9 (v1.5.0+) — check env `GSD_T_WORKER_TASK_IDS` FIRST. If SET, you",
1659
+ "are one of N planner-assigned workers in a supervisor-level fan-out.",
1660
+ "The value is your disjoint task-id subset. DO NOT spawn Task subagents",
1661
+ "to re-fan-out (the supervisor already did). Execute ONLY your assigned",
1662
+ "task IDs sequentially in this worker, then return. Skip the rest of",
1663
+ "this block.",
1664
+ "",
1665
+ "If GSD_T_WORKER_TASK_IDS is UNSET, the supervisor's planner decided",
1666
+ "N=1 for this iter (sequential fallback: gates vetoed, file-disjointness",
1667
+ "unprovable, or est CW% too high). Proceed with the legacy worker-level",
1668
+ "Team Mode below:",
1669
+ "",
1356
1670
  "Before executing tasks for this iteration, read `.gsd-t/partition.md` to",
1357
1671
  "identify the current wave and which domains belong to it.",
1358
1672
  "",
@@ -1427,6 +1741,97 @@ function _spawnWorker(state, opts) {
1427
1741
  return finalize(spawnResult);
1428
1742
  }
1429
1743
 
1744
// ── _spawnWorkerFanOut (M44 D9, contract v1.5.0) ────────────────────────────

/**
 * Planner-driven multi-worker fan-out. Spawns one concurrent worker per
 * subset via the injected `spawnWorker` shim; each worker sees a disjoint
 * task-id slice through `opts.taskIds` plus `_workerIndex`/`_workerTotal`
 * markers on its state copy. All workers are awaited (a throwing spawn is
 * converted to a status-3 result, never a rejection) before a merged result
 * — shape-compatible with the single-worker path — is returned.
 *
 * Merge semantics:
 *  - `status`         — 0 only if every worker returned 0; otherwise the first
 *                       non-zero status in index order (a missing numeric
 *                       status counts as 1).
 *  - `stdout`         — per-worker blocks joined under `[WORKER i/N tasks=...]`
 *                       headers.
 *  - `stderr`         — concatenated (empty worker stderr is omitted).
 *  - `staleHeartbeat` / `timedOut` — true if any worker triggered them;
 *    `heartbeatReason` is the first one seen.
 *  - `workerResults`  — per-worker {idx, status, taskIds, spawnId, signal,
 *                       elapsedMs, staleHeartbeat, timedOut} for state.json
 *                       aggregation.
 *
 * The caller (runMainLoop) classifies this result exactly like a single
 * worker's; multi-worker observability lives in `workerResults`, not in new
 * control-flow branches.
 */
async function _spawnWorkerFanOut(state, opts, spawnWorker, subsets) {
  const total = subsets.length;

  const settled = await Promise.all(
    subsets.map((taskIds, idx) => {
      const launchedAt = Date.now();
      const workerState = {
        ...state,
        _workerIndex: idx,
        _workerTotal: total,
        _workerTaskIds: taskIds,
      };
      return Promise.resolve()
        .then(() => spawnWorker(workerState, { ...opts, taskIds }))
        .then((result) => ({
          r: result || {},
          taskIds,
          started: launchedAt,
          ended: Date.now(),
          idx,
        }))
        .catch((err) => ({
          r: {
            status: 3,
            stdout: "",
            stderr: String((err && err.message) || err),
            signal: null,
          },
          taskIds,
          started: launchedAt,
          ended: Date.now(),
          idx,
        }));
    })
  );
  // Promise.all preserves input order, but keep the explicit sort so index
  // order is guaranteed regardless of how the shim resolves.
  settled.sort((x, y) => x.idx - y.idx);

  let mergedStatus = 0;
  let anyStale = false;
  let anyTimedOut = false;
  let firstHeartbeatReason = null;
  const outBlocks = [];
  const errBlocks = [];
  const workerResults = [];

  for (const entry of settled) {
    const result = entry.r;
    const exit = typeof result.status === "number" ? result.status : null;
    // Worst-exit-wins, first-seen: only the first non-zero status is recorded.
    if (mergedStatus === 0 && exit !== 0) {
      mergedStatus = exit === null ? 1 : exit;
    }
    anyStale = anyStale || !!result.staleHeartbeat;
    anyTimedOut = anyTimedOut || !!result.timedOut;
    if (!firstHeartbeatReason && result.heartbeatReason) {
      firstHeartbeatReason = result.heartbeatReason;
    }
    const tag = `[WORKER ${entry.idx + 1}/${settled.length} tasks=${(entry.taskIds || []).join(",") || "-"}]`;
    outBlocks.push(`${tag}\n${result.stdout || ""}`);
    if (result.stderr) {
      errBlocks.push(`${tag}\n${result.stderr}`);
    }
    workerResults.push({
      idx: entry.idx,
      status: exit,
      taskIds: entry.taskIds,
      spawnId: result.spawnId || null,
      signal: result.signal || null,
      elapsedMs: entry.ended - entry.started,
      staleHeartbeat: !!result.staleHeartbeat,
      timedOut: !!result.timedOut,
    });
  }

  return {
    status: mergedStatus,
    stdout: outBlocks.join("\n"),
    stderr: errBlocks.join("\n"),
    signal: null,
    timedOut: anyTimedOut,
    staleHeartbeat: anyStale,
    heartbeatReason: firstHeartbeatReason,
    workerResults,
    fanOutCount: settled.length,
  };
}
1822
+
1823
/**
 * Partition a task-id list into `workerCount` roughly-equal subsets via
 * simple round-robin assignment. Each subset is non-empty as long as
 * `tasks.length >= workerCount`.
 *
 * @param {Array} tasks - task ids to distribute; non-arrays and empty arrays
 *   yield `[]`.
 * @param {number} workerCount - desired worker count; floored to an integer.
 *   Values below 1 (including NaN, which previously crashed the round-robin
 *   index with a TypeError) yield `[]`; fractional counts such as 2.5 no
 *   longer over-index the allocated subsets.
 * @returns {Array<Array>} at most `min(floor(workerCount), tasks.length)`
 *   round-robin subsets.
 */
function _partitionTasks(tasks, workerCount) {
  if (!Array.isArray(tasks) || tasks.length === 0) return [];
  // Floor first so fractional counts can't index past the allocated subsets;
  // the negated >= comparison also routes NaN to the empty result.
  const requested = Math.floor(workerCount);
  if (!(requested >= 1)) return [];
  const n = Math.min(requested, tasks.length);
  const subsets = Array.from({ length: n }, () => []);
  for (let i = 0; i < tasks.length; i++) subsets[i % n].push(tasks[i]);
  return subsets;
}
1834
+
1430
1835
  // ── _testModeSpawnWorker ────────────────────────────────────────────────────
1431
1836
 
1432
1837
  /**