@tekyzinc/gsd-t 3.18.17 → 3.20.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -147,6 +147,50 @@ module.exports = {
147
147
  DEFAULT_HEARTBEAT_POLL_MS,
148
148
  };
149
149
 
150
/**
 * Fold one batch of iteration results into the shared supervisor `state`.
 *
 * Merge rules, applied to each result in array order:
 *   - tasksDone    → appended to `state.completedTasks`, keeping first-seen
 *                    order and skipping values that are already present.
 *   - status       → last writer wins: any truthy `r.status` that differs from
 *                    `state.status` replaces it. There is no special handling
 *                    for 'error' — a later result with another status
 *                    overwrites it.
 *   - verifyNeeded → OR across results: a single `true` sets the flag.
 *   - artifacts    → array-valued `r.artifacts` are concatenated onto
 *                    `state.artifacts`; non-array values are ignored.
 *
 * Entries that are not objects are skipped during merging but still count
 * toward `state.lastBatch.size`. When `results` is not a non-empty array the
 * call is a no-op (`state.lastBatch` is left untouched).
 *
 * @param {object} state    Supervisor state, mutated in place.
 * @param {Array<object>} results  Results of the batch that just finished.
 * @returns {void}
 */
function _reconcile(state, results) {
  if (!Array.isArray(results) || results.length === 0) return;

  // Both are created lazily and at most once per call, so merging stays
  // linear in the number of tasks instead of re-copying the list per task.
  let seen = null;   // Set of task ids already in state.completedTasks
  let merged = null; // fresh array owned by this call; never the caller's array

  for (const r of results) {
    if (!r || typeof r !== 'object') continue;

    // append-only completedTasks (preserve order, dedupe)
    if (Array.isArray(r.tasksDone) && r.tasksDone.length > 0) {
      if (seen === null) seen = new Set(state.completedTasks || []);
      for (const t of r.tasksDone) {
        if (seen.has(t)) continue;
        if (merged === null) {
          // First new task: copy-on-write so the previous array object that
          // other code may still hold is not mutated.
          merged = (state.completedTasks || []).concat([t]);
          state.completedTasks = merged;
        } else {
          merged.push(t);
        }
        seen.add(t);
      }
    }

    // last-writer-wins on status.
    if (r.status && r.status !== state.status) {
      state.status = r.status;
    }

    // verifyNeeded is OR-across-results: any iter that flags it wins.
    if (r.verifyNeeded === true) {
      state.verifyNeeded = true;
    }

    // artifacts: append-only, concat arrays.
    if (Array.isArray(r.artifacts) && r.artifacts.length > 0) {
      state.artifacts = (state.artifacts || []).concat(r.artifacts);
    }
  }

  // NOTE: `state.iter` is advanced by the main while loop (pre-M46 contract:
  // one increment per fan-out pass, regardless of worker/batch count). We do
  // NOT advance it here — doing so would double-increment against the
  // existing supervisor-contract invariant (surfaced by m43/m44 tests).
  state.lastBatch = {
    size: results.length,
    endedAt: new Date().toISOString(),
    errorCount: results.filter(r => r && r.status === 'error').length,
  };
}
188
+
189
+ // M46 D1 T2 — expose the extracted single-iter body and the iter-parallel helpers
190
+ // (batch sizing, parallel driver, reconcile) for unit tests (T7) and T4/T5.
191
+ // Kept out of the main exports block so consumers don't accidentally import implementation details.
192
+ module.exports.__test__ = { _runOneIter, _computeIterBatchSize, _runIterParallel, _reconcile };
193
+
150
194
  // ── parseArgs ───────────────────────────────────────────────────────────────
151
195
 
152
196
  /**
@@ -966,337 +1010,497 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
966
1010
  const heartbeatEnabled = !deps._disableHeartbeat && !useTestStub;
967
1011
  const projectDir = state.projectDir;
968
1012
 
1013
+ // M46 D1 T2 — pure extract-method refactor. The body of each iteration
1014
+ // now lives in the top-level `_runOneIter` helper (below). The while loop
1015
+ // itself is unchanged in semantics: stop-check and isDone evaluate per
1016
+ // pass, and any terminal state.status ({"done","failed"}) written by the
1017
+ // iter body causes us to break, matching every pre-refactor `break` path.
1018
+ // Non-terminal outcomes fall through to the next iteration, matching the
1019
+ // pre-refactor `continue` paths.
1020
+ const iterCtx = {
1021
+ dir,
1022
+ fn,
1023
+ config,
1024
+ spawnWorker,
1025
+ milestoneComplete,
1026
+ runParallelImpl,
1027
+ workerTimeoutMs,
1028
+ heartbeatImpl,
1029
+ heartbeatEnabled,
1030
+ staleHeartbeatMs,
1031
+ heartbeatPollMs,
1032
+ projectDir,
1033
+ verbose: !!opts.verbose,
1034
+ };
969
1035
  while (!isDone(state) && !stopCheck(projectDir)) {
970
- // ── PRE-WORKER HOOK (contract §12) ─────────────────────────────────────
971
- // Refusal → halt with status=failed, lastExit=6 (caps) or 2 (validate).
972
- const capIter = fn.checkIterationCap(state, config);
973
- if (!capIter.ok) {
974
- state.status = "failed";
975
- state.lastExit = capIter.code || 6;
976
- writeState(state, dir);
977
- break;
978
- }
979
- const capWall = fn.checkWallClockCap(state, config);
980
- if (!capWall.ok) {
981
- state.status = "failed";
982
- state.lastExit = capWall.code || 6;
983
- writeState(state, dir);
984
- break;
985
- }
986
- const vRes = fn.validateState(state);
987
- if (!vRes.ok) {
988
- state.status = "failed";
989
- state.lastExit = vRes.code || 2;
990
- writeState(state, dir);
991
- break;
992
- }
1036
+ const batchSize = _computeIterBatchSize(state, opts);
1037
+ const _batchStartMs = Date.now();
1038
+ try {
1039
+ fs.appendFileSync(
1040
+ path.join(dir, RUN_LOG),
1041
+ `[iter-batch-start] batch-size=${batchSize} iter=${state.iter} ts=${new Date(_batchStartMs).toISOString()}\n`,
1042
+ "utf8"
1043
+ );
1044
+ } catch (_) { /* best effort */ }
1045
+ const results = await _runIterParallel(state, opts, (s, o) => _runOneIter(s, iterCtx), batchSize);
1046
+ _reconcile(state, results);
1047
+ try {
1048
+ const _ok = results.filter((r) => r.status !== "error").length;
1049
+ const _fail = results.length - _ok;
1050
+ const _durSec = ((Date.now() - _batchStartMs) / 1000).toFixed(1);
1051
+ fs.appendFileSync(
1052
+ path.join(dir, RUN_LOG),
1053
+ `[iter-batch-complete] size=${results.length} ok=${_ok} fail=${_fail} duration=${_durSec}s iter=${state.iter}\n`,
1054
+ "utf8"
1055
+ );
1056
+ } catch (_) { /* best effort */ }
1057
+ if (isTerminal(state.status)) break;
1058
+ }
993
1059
 
994
- // Pre-spawn bookkeeping
995
- state.iter = (state.iter || 0) + 1;
996
- const workerStart = new Date();
997
- state.lastWorkerStartedAt = workerStart.toISOString();
1060
+ // If we exited because the user dropped a stop sentinel and no terminal
1061
+ // status has been assigned yet, transition to 'stopped' now (contract §10).
1062
+ // The sentinel file itself is NOT removed by the supervisor — it stays on
1063
+ // disk as evidence, to be cleaned by the next launch via
1064
+ // `cleanStaleStopSentinel`.
1065
+ if (!isTerminal(state.status) && stopCheck(projectDir)) {
1066
+ state.status = "stopped";
1067
+ writeState(state, dir);
1068
+ }
1069
+ return state;
1070
+ }
1071
+
1072
+ // ── _runOneIter (M46 D1 T2) ─────────────────────────────────────────────────
1073
+
1074
+ /**
1075
+ * Body of a single supervisor iteration, extracted verbatim from the
1076
+ * `runMainLoop` while-loop (pre-M46-D1). Mutates `state` in place exactly as
1077
+ * the original body did — all writeState calls, event-stream emits, run.log
1078
+ * and token-log appends, heartbeat wiring, fan-out dispatch, and exit-code
1079
+ * classification are preserved line-for-line.
1080
+ *
1081
+ * `opts` here is the per-iter context bundle assembled in runMainLoop (not
1082
+ * the supervisor-level opts object). It carries the closure values the body
1083
+ * used to read from the enclosing scope: fn, config, dir, projectDir,
1084
+ * spawnWorker, milestoneComplete, runParallelImpl, workerTimeoutMs,
1085
+ * heartbeatImpl, heartbeatEnabled, staleHeartbeatMs, heartbeatPollMs, verbose.
1086
+ *
1087
+ * Returns an IterResult per iter-parallel-contract.md v1.0.0 §4. The shape is
1088
+ * minimal: tasksDone is always [] and artifacts is a plain object (an
1089
+ * errorMessage on failure paths, otherwise {}). The while-loop driver hands the
1090
+ * results to `_reconcile` and then breaks on `isTerminal(state.status)`; note
1091
+ * that `_reconcile` only merges array-valued artifacts, so these are not merged.
1092
+ */
1093
+ async function _runOneIter(state, opts) {
1094
+ const {
1095
+ dir, fn, config, spawnWorker, milestoneComplete, runParallelImpl,
1096
+ workerTimeoutMs, heartbeatImpl, heartbeatEnabled,
1097
+ staleHeartbeatMs, heartbeatPollMs, projectDir,
1098
+ } = opts;
1099
+
1100
+ const _result = (status, extras) => ({
1101
+ iter: state.iter,
1102
+ status,
1103
+ tasksDone: [],
1104
+ verifyNeeded: status === "verify-needed",
1105
+ artifacts: extras || {},
1106
+ });
1107
+
1108
+ // ── PRE-WORKER HOOK (contract §12) ─────────────────────────────────────
1109
+ // Refusal → halt with status=failed, lastExit=6 (caps) or 2 (validate).
1110
+ const capIter = fn.checkIterationCap(state, config);
1111
+ if (!capIter.ok) {
1112
+ state.status = "failed";
1113
+ state.lastExit = capIter.code || 6;
1114
+ writeState(state, dir);
1115
+ return _result("failed", { errorMessage: `iteration_cap:${state.lastExit}` });
1116
+ }
1117
+ const capWall = fn.checkWallClockCap(state, config);
1118
+ if (!capWall.ok) {
1119
+ state.status = "failed";
1120
+ state.lastExit = capWall.code || 6;
1121
+ writeState(state, dir);
1122
+ return _result("failed", { errorMessage: `wall_clock_cap:${state.lastExit}` });
1123
+ }
1124
+ const vRes = fn.validateState(state);
1125
+ if (!vRes.ok) {
1126
+ state.status = "failed";
1127
+ state.lastExit = vRes.code || 2;
998
1128
  writeState(state, dir);
1129
+ return _result("failed", { errorMessage: `validate_state:${state.lastExit}` });
1130
+ }
999
1131
 
1132
+ // Pre-spawn bookkeeping
1133
+ state.iter = (state.iter || 0) + 1;
1134
+ const workerStart = new Date();
1135
+ state.lastWorkerStartedAt = workerStart.toISOString();
1136
+ writeState(state, dir);
1137
+
1138
+ _emit(projectDir, {
1139
+ ts: workerStart.toISOString(),
1140
+ iter: state.iter,
1141
+ type: "task_start",
1142
+ source: "supervisor",
1143
+ milestone: state.milestone || "",
1144
+ wave: state.wave || "",
1145
+ task: state.nextTask || "",
1146
+ });
1147
+
1148
+ let res;
1149
+ const workerStartMs = workerStart.getTime();
1150
+ const hbOpts = heartbeatEnabled
1151
+ ? {
1152
+ onHeartbeatCheck: () =>
1153
+ heartbeatImpl({
1154
+ projectDir,
1155
+ workerStartedAt: workerStartMs,
1156
+ staleHeartbeatMs,
1157
+ }),
1158
+ heartbeatPollMs,
1159
+ }
1160
+ : {};
1161
+
1162
+ // M44 D9 (v1.5.0) — planner-driven fan-out decision for this iter.
1163
+ // Ask runParallel whether the current task graph supports ≥2 concurrent
1164
+ // workers. Any failure in the planner path MUST fall back to the single-
1165
+ // worker spawn — the parallel path is purely additive.
1166
+ let iterPlan = null;
1167
+ try {
1168
+ iterPlan = runParallelImpl({
1169
+ projectDir,
1170
+ mode: "unattended",
1171
+ milestone: state.milestone || null,
1172
+ dryRun: true,
1173
+ });
1174
+ } catch (e) {
1175
+ iterPlan = null;
1000
1176
  _emit(projectDir, {
1001
- ts: workerStart.toISOString(),
1002
1177
  iter: state.iter,
1003
- type: "task_start",
1178
+ type: "parallelism_reduced",
1004
1179
  source: "supervisor",
1005
- milestone: state.milestone || "",
1006
- wave: state.wave || "",
1007
- task: state.nextTask || "",
1180
+ original_count: null,
1181
+ reduced_count: 1,
1182
+ reason: `planner_error:${(e && e.message) || "unknown"}`,
1008
1183
  });
1184
+ }
1185
+ const fanOutCount = iterPlan && Number(iterPlan.workerCount) >= 2 ? Number(iterPlan.workerCount) : 1;
1186
+ const parallelTaskIds = iterPlan && Array.isArray(iterPlan.parallelTasks) ? iterPlan.parallelTasks : [];
1187
+ const subsets = fanOutCount >= 2 ? _partitionTasks(parallelTaskIds, fanOutCount) : null;
1188
+ const useFanOut = !!(subsets && subsets.length >= 2);
1009
1189
 
1010
- let res;
1011
- const workerStartMs = workerStart.getTime();
1012
- const hbOpts = heartbeatEnabled
1013
- ? {
1014
- onHeartbeatCheck: () =>
1015
- heartbeatImpl({
1016
- projectDir,
1017
- workerStartedAt: workerStartMs,
1018
- staleHeartbeatMs,
1019
- }),
1020
- heartbeatPollMs,
1021
- }
1022
- : {};
1023
-
1024
- // M44 D9 (v1.5.0) — planner-driven fan-out decision for this iter.
1025
- // Ask runParallel whether the current task graph supports ≥2 concurrent
1026
- // workers. Any failure in the planner path MUST fall back to the single-
1027
- // worker spawn — the parallel path is purely additive.
1028
- let iterPlan = null;
1029
- try {
1030
- iterPlan = runParallelImpl({
1031
- projectDir,
1032
- mode: "unattended",
1033
- milestone: state.milestone || null,
1034
- dryRun: true,
1035
- });
1036
- } catch (e) {
1037
- iterPlan = null;
1190
+ try {
1191
+ if (useFanOut) {
1038
1192
  _emit(projectDir, {
1193
+ ts: workerStart.toISOString(),
1039
1194
  iter: state.iter,
1040
- type: "parallelism_reduced",
1195
+ type: "fan_out",
1041
1196
  source: "supervisor",
1042
- original_count: null,
1043
- reduced_count: 1,
1044
- reason: `planner_error:${(e && e.message) || "unknown"}`,
1197
+ worker_count: subsets.length,
1198
+ task_ids: parallelTaskIds,
1045
1199
  });
1046
- }
1047
- const fanOutCount = iterPlan && Number(iterPlan.workerCount) >= 2 ? Number(iterPlan.workerCount) : 1;
1048
- const parallelTaskIds = iterPlan && Array.isArray(iterPlan.parallelTasks) ? iterPlan.parallelTasks : [];
1049
- const subsets = fanOutCount >= 2 ? _partitionTasks(parallelTaskIds, fanOutCount) : null;
1050
- const useFanOut = !!(subsets && subsets.length >= 2);
1051
-
1052
- try {
1053
- if (useFanOut) {
1054
- _emit(projectDir, {
1055
- ts: workerStart.toISOString(),
1056
- iter: state.iter,
1057
- type: "fan_out",
1058
- source: "supervisor",
1059
- worker_count: subsets.length,
1060
- task_ids: parallelTaskIds,
1061
- });
1062
- res = await _spawnWorkerFanOut(state, {
1063
- cwd: projectDir,
1064
- timeout: workerTimeoutMs,
1065
- verbose: !!opts.verbose,
1066
- ...hbOpts,
1067
- }, spawnWorker, subsets);
1068
- } else {
1069
- res = spawnWorker(state, {
1070
- cwd: projectDir,
1071
- timeout: workerTimeoutMs,
1072
- verbose: !!opts.verbose,
1073
- ...hbOpts,
1074
- });
1075
- if (res && typeof res.then === "function") {
1076
- res = await res;
1077
- }
1078
- }
1079
- } catch (e) {
1080
- // Defensive: a real spawnSync shouldn't throw, but a shim could.
1081
- res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
1082
- }
1083
- res = res || { status: null, stdout: "", stderr: "", signal: null };
1084
-
1085
- const workerEnd = new Date();
1086
- const elapsedMs = workerEnd.getTime() - workerStart.getTime();
1087
- const stdout = typeof res.stdout === "string" ? res.stdout : "";
1088
- const stderr = typeof res.stderr === "string" ? res.stderr : "";
1089
-
1090
- // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
1091
- // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
1092
- // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
1093
- // Heartbeat wins on ties because it's the more specific signal.
1094
- let exitCode;
1095
- let lastExitReason = null;
1096
- if (res.staleHeartbeat === true) {
1097
- exitCode = 125;
1098
- lastExitReason = "stale_heartbeat";
1099
- } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
1100
- exitCode = 124;
1101
- lastExitReason = "worker_timeout";
1200
+ res = await _spawnWorkerFanOut(state, {
1201
+ cwd: projectDir,
1202
+ timeout: workerTimeoutMs,
1203
+ verbose: !!opts.verbose,
1204
+ ...hbOpts,
1205
+ }, spawnWorker, subsets);
1102
1206
  } else {
1103
- exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
1207
+ res = spawnWorker(state, {
1208
+ cwd: projectDir,
1209
+ timeout: workerTimeoutMs,
1210
+ verbose: !!opts.verbose,
1211
+ ...hbOpts,
1212
+ });
1213
+ if (res && typeof res.then === "function") {
1214
+ res = await res;
1215
+ }
1104
1216
  }
1217
+ } catch (e) {
1218
+ // Defensive: a real spawnSync shouldn't throw, but a shim could.
1219
+ res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
1220
+ }
1221
+ res = res || { status: null, stdout: "", stderr: "", signal: null };
1222
+
1223
+ const workerEnd = new Date();
1224
+ const elapsedMs = workerEnd.getTime() - workerStart.getTime();
1225
+ const stdout = typeof res.stdout === "string" ? res.stdout : "";
1226
+ const stderr = typeof res.stderr === "string" ? res.stderr : "";
1227
+
1228
+ // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
1229
+ // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
1230
+ // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
1231
+ // Heartbeat wins on ties because it's the more specific signal.
1232
+ let exitCode;
1233
+ let lastExitReason = null;
1234
+ if (res.staleHeartbeat === true) {
1235
+ exitCode = 125;
1236
+ lastExitReason = "stale_heartbeat";
1237
+ } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
1238
+ exitCode = 124;
1239
+ lastExitReason = "worker_timeout";
1240
+ } else {
1241
+ exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
1242
+ }
1105
1243
 
1106
- // v3.13.11 Bug 1: when a watchdog fires, make the event explicit in
1107
- // run.log so operators can see WHICH iteration was cut without inferring
1108
- // from exit codes. The marker is prepended to stdout and written in the
1109
- // single per-iter run.log append (no duplicate header).
1110
- let loggedStdout = stdout;
1111
- if (exitCode === 124) {
1112
- const marker =
1113
- `[worker_timeout] iter=${state.iter} budget=${workerTimeoutMs}ms ` +
1114
- `elapsed=${elapsedMs}ms — absolute-backstop SIGTERM delivered, ` +
1115
- `supervisor continues relay per contract §16.\n`;
1116
- loggedStdout = marker + (stdout || "");
1117
- } else if (exitCode === 125) {
1118
- const reason = res.heartbeatReason || "no recent events.jsonl writes";
1119
- const marker =
1120
- `[stale_heartbeat] iter=${state.iter} threshold=${staleHeartbeatMs}ms ` +
1121
- `elapsed=${elapsedMs}ms reason="${reason}" — ` +
1122
- `heartbeat watchdog SIGTERM delivered, supervisor continues relay.\n`;
1123
- loggedStdout = marker + (stdout || "");
1124
- }
1244
+ // v3.13.11 Bug 1: when a watchdog fires, make the event explicit in
1245
+ // run.log so operators can see WHICH iteration was cut without inferring
1246
+ // from exit codes. The marker is prepended to stdout and written in the
1247
+ // single per-iter run.log append (no duplicate header).
1248
+ let loggedStdout = stdout;
1249
+ if (exitCode === 124) {
1250
+ const marker =
1251
+ `[worker_timeout] iter=${state.iter} budget=${workerTimeoutMs}ms ` +
1252
+ `elapsed=${elapsedMs}ms — absolute-backstop SIGTERM delivered, ` +
1253
+ `supervisor continues relay per contract §16.\n`;
1254
+ loggedStdout = marker + (stdout || "");
1255
+ } else if (exitCode === 125) {
1256
+ const reason = res.heartbeatReason || "no recent events.jsonl writes";
1257
+ const marker =
1258
+ `[stale_heartbeat] iter=${state.iter} threshold=${staleHeartbeatMs}ms ` +
1259
+ `elapsed=${elapsedMs}ms reason="${reason}" — ` +
1260
+ `heartbeat watchdog SIGTERM delivered, supervisor continues relay.\n`;
1261
+ loggedStdout = marker + (stdout || "");
1262
+ }
1125
1263
 
1126
- // Append the full worker output to run.log (never truncate).
1127
- _appendRunLog(dir, state.iter, workerEnd, exitCode, loggedStdout, stderr);
1128
-
1129
- // Append to token-log.md (Fix 1, v3.12.12) — supervisor workers write rows
1130
- // so the log captures headless/unattended activity, not just interactive spawns.
1131
- _appendTokenLog(projectDir, {
1132
- dtStart: workerStart.toISOString().slice(0, 16).replace("T", " "),
1133
- dtEnd: workerEnd.toISOString().slice(0, 16).replace("T", " "),
1134
- command: "gsd-t-resume",
1135
- durationS: Math.round(elapsedMs / 1000),
1136
- exitCode,
1264
+ // Append the full worker output to run.log (never truncate).
1265
+ _appendRunLog(dir, state.iter, workerEnd, exitCode, loggedStdout, stderr);
1266
+
1267
+ // Append to token-log.md (Fix 1, v3.12.12) — supervisor workers write rows
1268
+ // so the log captures headless/unattended activity, not just interactive spawns.
1269
+ _appendTokenLog(projectDir, {
1270
+ dtStart: workerStart.toISOString().slice(0, 16).replace("T", " "),
1271
+ dtEnd: workerEnd.toISOString().slice(0, 16).replace("T", " "),
1272
+ command: "gsd-t-resume",
1273
+ durationS: Math.round(elapsedMs / 1000),
1274
+ exitCode,
1275
+ iter: state.iter,
1276
+ });
1277
+
1278
+ // Post-spawn state update
1279
+ state.lastExit = exitCode;
1280
+ state.lastWorkerFinishedAt = workerEnd.toISOString();
1281
+ state.lastElapsedMs = elapsedMs;
1282
+ if (lastExitReason) {
1283
+ state.lastExitReason = lastExitReason;
1284
+ } else if (exitCode === 0) {
1285
+ state.lastExitReason = "clean";
1286
+ } else {
1287
+ state.lastExitReason = `exit_${exitCode}`;
1288
+ }
1289
+ // M44 D9 (v1.5.0) — per-iter multi-worker aggregates. Present only when the
1290
+ // planner selected fan-out; single-worker iters omit these fields so the
1291
+ // state schema stays backward-compatible with v1.4.x readers.
1292
+ if (useFanOut && Array.isArray(res.workerResults)) {
1293
+ state.lastExits = res.workerResults.map((w) => ({
1294
+ idx: w.idx,
1295
+ code: typeof w.status === "number" ? w.status : null,
1296
+ taskIds: w.taskIds || [],
1297
+ elapsedMs: w.elapsedMs,
1298
+ spawnId: w.spawnId || null,
1299
+ }));
1300
+ state.workerPids = res.workerResults.map((w) => w.spawnId || null);
1301
+ state.lastFanOutCount = res.workerResults.length;
1302
+ } else {
1303
+ // Clear stale multi-worker fields on single-worker iters so readers
1304
+ // never see a mix of regimes.
1305
+ if (state.lastExits) delete state.lastExits;
1306
+ if (state.workerPids) delete state.workerPids;
1307
+ if (state.lastFanOutCount) delete state.lastFanOutCount;
1308
+ }
1309
+ writeState(state, dir);
1310
+
1311
+ // Event-stream: task_complete on success, error on non-zero.
1312
+ const durationS = Math.round(elapsedMs / 1000);
1313
+ if (exitCode === 0) {
1314
+ _emit(projectDir, {
1315
+ ts: workerEnd.toISOString(),
1137
1316
  iter: state.iter,
1317
+ type: "task_complete",
1318
+ source: "supervisor",
1319
+ task: state.nextTask || "",
1320
+ verdict: "pass",
1321
+ duration_s: durationS,
1322
+ });
1323
+ } else {
1324
+ _emit(projectDir, {
1325
+ ts: workerEnd.toISOString(),
1326
+ iter: state.iter,
1327
+ type: "error",
1328
+ source: "supervisor",
1329
+ error: `worker exit ${exitCode}`,
1330
+ recoverable: exitCode !== 4 && exitCode !== 5,
1138
1331
  });
1332
+ }
1139
1333
 
1140
- // Post-spawn state update
1141
- state.lastExit = exitCode;
1142
- state.lastWorkerFinishedAt = workerEnd.toISOString();
1143
- state.lastElapsedMs = elapsedMs;
1144
- if (lastExitReason) {
1145
- state.lastExitReason = lastExitReason;
1146
- } else if (exitCode === 0) {
1147
- state.lastExitReason = "clean";
1148
- } else {
1149
- state.lastExitReason = `exit_${exitCode}`;
1150
- }
1151
- // M44 D9 (v1.5.0) — per-iter multi-worker aggregates. Present only when the
1152
- // planner selected fan-out; single-worker iters omit these fields so the
1153
- // state schema stays backward-compatible with v1.4.x readers.
1154
- if (useFanOut && Array.isArray(res.workerResults)) {
1155
- state.lastExits = res.workerResults.map((w) => ({
1156
- idx: w.idx,
1157
- code: typeof w.status === "number" ? w.status : null,
1158
- taskIds: w.taskIds || [],
1159
- elapsedMs: w.elapsedMs,
1160
- spawnId: w.spawnId || null,
1161
- }));
1162
- state.workerPids = res.workerResults.map((w) => w.spawnId || null);
1163
- state.lastFanOutCount = res.workerResults.length;
1164
- } else {
1165
- // Clear stale multi-worker fields on single-worker iters so readers
1166
- // never see a mix of regimes.
1167
- if (state.lastExits) delete state.lastExits;
1168
- if (state.workerPids) delete state.workerPids;
1169
- if (state.lastFanOutCount) delete state.lastFanOutCount;
1334
+ // ── POST-WORKER HOOK (contract §12) ────────────────────────────────────
1335
+ // Read the tail of run.log for pattern detection. ~200 lines is enough
1336
+ // to span the last several iteration blocks for the gutter detector.
1337
+ let runLogTail = "";
1338
+ try {
1339
+ const logPath = path.join(dir, RUN_LOG);
1340
+ if (fs.existsSync(logPath)) {
1341
+ const all = fs.readFileSync(logPath, "utf8");
1342
+ const lines = all.split(/\r?\n/);
1343
+ runLogTail = lines.slice(-200).join("\n");
1170
1344
  }
1345
+ } catch (_) {
1346
+ // best effort — tail read failure does not halt the loop
1347
+ }
1348
+ const blocker = fn.detectBlockerSentinel(runLogTail);
1349
+ if (!blocker.ok) {
1350
+ state.status = "failed";
1351
+ state.lastExit = blocker.code || 6;
1171
1352
  writeState(state, dir);
1353
+ return _result("failed", { errorMessage: `blocker_sentinel:${state.lastExit}` });
1354
+ }
1355
+ const gutter = fn.detectGutter(state, runLogTail, config);
1356
+ if (!gutter.ok) {
1357
+ state.status = "failed";
1358
+ state.lastExit = gutter.code || 6;
1359
+ writeState(state, dir);
1360
+ return _result("failed", { errorMessage: `gutter:${state.lastExit}` });
1361
+ }
1172
1362
 
1173
- // Event-stream: task_complete on success, error on non-zero.
1174
- const durationS = Math.round(elapsedMs / 1000);
1175
- if (exitCode === 0) {
1176
- _emit(projectDir, {
1177
- ts: workerEnd.toISOString(),
1178
- iter: state.iter,
1179
- type: "task_complete",
1180
- source: "supervisor",
1181
- task: state.nextTask || "",
1182
- verdict: "pass",
1183
- duration_s: durationS,
1184
- });
1185
- } else {
1186
- _emit(projectDir, {
1187
- ts: workerEnd.toISOString(),
1188
- iter: state.iter,
1189
- type: "error",
1190
- source: "supervisor",
1191
- error: `worker exit ${exitCode}`,
1192
- recoverable: exitCode !== 4 && exitCode !== 5,
1193
- });
1194
- }
1195
-
1196
- // ── POST-WORKER HOOK (contract §12) ────────────────────────────────────
1197
- // Read the tail of run.log for pattern detection. ~200 lines is enough
1198
- // to span the last several iteration blocks for the gutter detector.
1199
- let runLogTail = "";
1200
- try {
1201
- const logPath = path.join(dir, RUN_LOG);
1202
- if (fs.existsSync(logPath)) {
1203
- const all = fs.readFileSync(logPath, "utf8");
1204
- const lines = all.split(/\r?\n/);
1205
- runLogTail = lines.slice(-200).join("\n");
1206
- }
1207
- } catch (_) {
1208
- // best effort — tail read failure does not halt the loop
1209
- }
1210
- const blocker = fn.detectBlockerSentinel(runLogTail);
1211
- if (!blocker.ok) {
1212
- state.status = "failed";
1213
- state.lastExit = blocker.code || 6;
1214
- writeState(state, dir);
1215
- break;
1216
- }
1217
- const gutter = fn.detectGutter(state, runLogTail, config);
1218
- if (!gutter.ok) {
1219
- state.status = "failed";
1220
- state.lastExit = gutter.code || 6;
1221
- writeState(state, dir);
1222
- break;
1223
- }
1224
-
1225
- // Terminal exit classification
1226
- if (exitCode === 0) {
1227
- // Success — check if the milestone is now complete.
1228
- if (milestoneComplete(projectDir, state.milestone)) {
1229
- state.status = "done";
1230
- writeState(state, dir);
1231
- break;
1232
- }
1233
- // Not yet done — continue relay.
1234
- _emit(projectDir, {
1235
- iter: state.iter,
1236
- type: "retry",
1237
- source: "supervisor",
1238
- attempt: state.iter,
1239
- reason: "milestone_incomplete",
1240
- });
1241
- continue;
1242
- }
1243
- if (exitCode === 4) {
1244
- // Unrecoverable blocker.
1245
- state.status = "failed";
1246
- writeState(state, dir);
1247
- break;
1248
- }
1249
- if (exitCode === 5) {
1250
- // Command dispatch failure — worker invocation is broken.
1251
- state.status = "failed";
1363
+ // Terminal exit classification
1364
+ if (exitCode === 0) {
1365
+ // Success — check if the milestone is now complete.
1366
+ if (milestoneComplete(projectDir, state.milestone)) {
1367
+ state.status = "done";
1252
1368
  writeState(state, dir);
1253
- break;
1254
- }
1255
- if (exitCode === 124) {
1256
- // Timeout — continue unless the iter cap is hit on the next check.
1257
- _emit(projectDir, {
1258
- iter: state.iter,
1259
- type: "retry",
1260
- source: "supervisor",
1261
- attempt: state.iter,
1262
- reason: "timeout",
1263
- });
1264
- continue;
1369
+ return _result("done");
1265
1370
  }
1266
- if (exitCode === 125) {
1267
- // Stale heartbeat (M43) — continue unless the iter cap hits. The
1268
- // heartbeat kill is recoverable by definition: the worker was not
1269
- // emitting events, which is the most common class of stuck iteration
1270
- // (e.g. child stuck on a long Bash call with no tool_call emits).
1271
- _emit(projectDir, {
1272
- iter: state.iter,
1273
- type: "retry",
1274
- source: "supervisor",
1275
- attempt: state.iter,
1276
- reason: "stale_heartbeat",
1277
- });
1278
- continue;
1279
- }
1280
- // Non-terminal (1/2/3) — continue the relay.
1371
+ // Not yet done — continue relay.
1281
1372
  _emit(projectDir, {
1282
1373
  iter: state.iter,
1283
1374
  type: "retry",
1284
1375
  source: "supervisor",
1285
1376
  attempt: state.iter,
1286
- reason: `exit_${exitCode}`,
1377
+ reason: "milestone_incomplete",
1287
1378
  });
1379
+ return _result("running");
1288
1380
  }
1289
-
1290
- // If we exited because the user dropped a stop sentinel and no terminal
1291
- // status has been assigned yet, transition to 'stopped' now (contract §10).
1292
- // The sentinel file itself is NOT removed by the supervisor — it stays on
1293
- // disk as evidence, to be cleaned by the next launch via
1294
- // `cleanStaleStopSentinel`.
1295
- if (!isTerminal(state.status) && stopCheck(projectDir)) {
1296
- state.status = "stopped";
1381
+ if (exitCode === 4) {
1382
+ // Unrecoverable blocker.
1383
+ state.status = "failed";
1297
1384
  writeState(state, dir);
1385
+ return _result("failed", { errorMessage: "exit_4_unrecoverable" });
1298
1386
  }
1299
- return state;
1387
+ if (exitCode === 5) {
1388
+ // Command dispatch failure — worker invocation is broken.
1389
+ state.status = "failed";
1390
+ writeState(state, dir);
1391
+ return _result("failed", { errorMessage: "exit_5_dispatch_failure" });
1392
+ }
1393
+ if (exitCode === 124) {
1394
+ // Timeout — continue unless the iter cap is hit on the next check.
1395
+ _emit(projectDir, {
1396
+ iter: state.iter,
1397
+ type: "retry",
1398
+ source: "supervisor",
1399
+ attempt: state.iter,
1400
+ reason: "timeout",
1401
+ });
1402
+ return _result("running");
1403
+ }
1404
+ if (exitCode === 125) {
1405
+ // Stale heartbeat (M43) — continue unless the iter cap hits. The
1406
+ // heartbeat kill is recoverable by definition: the worker was not
1407
+ // emitting events, which is the most common class of stuck iteration
1408
+ // (e.g. child stuck on a long Bash call with no tool_call emits).
1409
+ _emit(projectDir, {
1410
+ iter: state.iter,
1411
+ type: "retry",
1412
+ source: "supervisor",
1413
+ attempt: state.iter,
1414
+ reason: "stale_heartbeat",
1415
+ });
1416
+ return _result("running");
1417
+ }
1418
+ // Non-terminal (1/2/3) — continue the relay.
1419
+ _emit(projectDir, {
1420
+ iter: state.iter,
1421
+ type: "retry",
1422
+ source: "supervisor",
1423
+ attempt: state.iter,
1424
+ reason: `exit_${exitCode}`,
1425
+ });
1426
+ return _result("running");
1427
+ }
1428
+
1429
+ // ── _computeIterBatchSize (M46 D1 T3) ───────────────────────────────────────
1430
+
1431
/**
 * Decide how many iterations the supervisor main loop should dispatch
 * concurrently in the next pass. Implements the mode-safety rules from
 * `.gsd-t/contracts/iter-parallel-contract.md` v1.0.0 §3.1.
 *
 * Rules evaluated top-down; first match wins:
 *   1. status === "verify-needed"              → 1 (serial verify gate)
 *   2. milestoneBoundary === true              → 1 (milestone boundary)
 *   3. status === "complete-milestone"         → 1 (single-shot closeout)
 *   4. opts.maxIterParallel is not a number    → 1 (serial default; opt-in only)
 *   5. otherwise → floor(min(opts.maxIterParallel, remainingIters, 8))
 *      where remainingIters = (state.maxIterations ?? Infinity) - (state.iter ?? 0)
 *
 * Always returns an integer >= 1. A NaN, zero, negative or otherwise unusable
 * cap (or iteration budget) collapses to 1 rather than leaking NaN or a
 * fractional size to the caller.
 *
 * @param {object|null|undefined} state  Supervisor state; reads `status`,
 *   `milestoneBoundary`, `maxIterations` and `iter`.
 * @param {object|null|undefined} opts   Reads `maxIterParallel` only.
 * @returns {number} Batch size for the next pass, an integer in [1, 8].
 */
function _computeIterBatchSize(state, opts) {
  if (state && state.status === "verify-needed") return 1;
  if (state && state.milestoneBoundary === true) return 1;
  if (state && state.status === "complete-milestone") return 1;

  // Production default is 1 (serial, pre-M46 behavior). Iter-parallelism is
  // opt-in via `opts.maxIterParallel` — callers that pass a number enable it.
  // Rationale: `_runOneIter` mutates `state.iter` and other shared fields
  // (heartbeat bookkeeping, writeState) that are not safe to execute on the
  // same state object concurrently. Unit tests exercise the parallel path
  // with explicit batch sizes; production main loop omits the flag and runs
  // strictly serial, preserving the pre-M46 supervisor contract (one iter
  // counter increment per fan-out pass). See backlog #24 for the follow-up
  // that makes `_runOneIter` state-clone-safe and lifts this gate.
  if (!opts || typeof opts.maxIterParallel !== "number") return 1;

  const cap = opts.maxIterParallel;
  const maxIters = state && typeof state.maxIterations === "number"
    ? state.maxIterations
    : Infinity;
  const currentIter = state && typeof state.iter === "number"
    ? state.iter
    : 0;
  const remainingIters = maxIters - currentIter;

  // Math.min propagates NaN, and `NaN >= 1` is false, so the single
  // comparison below covers NaN as well as values below 1.
  const size = Math.floor(Math.min(cap, remainingIters, 8));
  return size >= 1 ? size : 1;
}
1473
+
1474
+ // ── _runIterParallel (M46 D1 T4) ────────────────────────────────────────────
1475
+
1476
/**
 * Dispatch `batchSize` independent iter slices concurrently and return an
 * IterResult[] of exactly that length. Implements the error-isolation rule
 * from `.gsd-t/contracts/iter-parallel-contract.md` v1.0.0 §4.2: a single
 * rejected iter is translated into an IterResult with status "error" and
 * does NOT cancel siblings. The caller decides how to react.
 *
 * Every slice receives the same `state` and `opts` references; nothing is
 * cloned here.
 *
 * @param {object} state      Passed through unchanged as the first argument of `iterFn`.
 * @param {object} opts       Passed through unchanged as the second argument of `iterFn`.
 * @param {Function} [iterFn] Slice body; `_runOneIter` is used when this is not a function.
 * @param {number} [batchSize] Number of slices. Fractional values are rounded
 *   down; anything that is not a finite number >= 1 falls back to 1 so the
 *   dispatch loop is always bounded.
 * @returns {Promise<Array<object>>} One entry per slice, in dispatch order:
 *   the fulfilled value as-is, or `{ status: "error", tasksDone: [],
 *   verifyNeeded: false, artifacts: [], error }` for a slice that threw or
 *   rejected.
 */
async function _runIterParallel(state, opts, iterFn, batchSize) {
  const fn = typeof iterFn === "function" ? iterFn : _runOneIter;
  // Number.isFinite rejects non-numbers, NaN and ±Infinity in one check; an
  // infinite size would otherwise never let the dispatch finish.
  const n = Number.isFinite(batchSize) && batchSize >= 1 ? Math.floor(batchSize) : 1;

  // Wrapping each call in a resolved promise turns a synchronous throw from
  // `fn` into a rejection of that slice only.
  const slices = Array.from({ length: n }, () =>
    Promise.resolve().then(() => fn(state, opts))
  );

  const settled = await Promise.allSettled(slices);
  return settled.map((s) => {
    if (s.status === "fulfilled") return s.value;
    const reason = s.reason;
    const msg = (reason && reason.message) ? reason.message : String(reason);
    return {
      status: "error",
      tasksDone: [],
      verifyNeeded: false,
      artifacts: [],
      error: msg,
    };
  });
}
1301
1505
 
1302
1506
  // ── _appendTokenLog (Fix 1, v3.12.12) ───────────────────────────────────────