@tekyzinc/gsd-t 3.18.13 → 3.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +114 -0
- package/bin/gsd-t-parallel-probe.cjs +132 -0
- package/bin/gsd-t-parallel.cjs +422 -9
- package/bin/gsd-t-task-graph.cjs +80 -19
- package/bin/gsd-t-unattended.cjs +634 -229
- package/bin/gsd-t-worker-dispatch.cjs +211 -0
- package/bin/headless-auto-spawn.cjs +44 -1
- package/bin/headless-exit-codes.cjs +36 -18
- package/bin/m44-proof-measure.cjs +285 -0
- package/bin/m46-iter-proof.cjs +149 -0
- package/bin/m46-worker-proof.cjs +201 -0
- package/bin/parallelism-report.cjs +535 -0
- package/bin/spawn-plan-writer.cjs +1 -1
- package/commands/gsd-t-debug.md +10 -14
- package/commands/gsd-t-execute.md +10 -16
- package/commands/gsd-t-help.md +1 -0
- package/commands/gsd-t-integrate.md +8 -14
- package/commands/gsd-t-quick.md +10 -14
- package/commands/gsd-t-resume.md +32 -0
- package/commands/gsd-t-status.md +10 -0
- package/commands/gsd-t-unattended-watch.md +58 -1
- package/commands/gsd-t-visualize.md +15 -12
- package/commands/gsd-t-wave.md +2 -11
- package/docs/architecture.md +82 -0
- package/docs/requirements.md +20 -0
- package/package.json +1 -1
- package/scripts/gsd-t-compact-detector.js +51 -8
- package/scripts/gsd-t-dashboard-server.js +138 -85
- package/scripts/gsd-t-transcript.html +152 -1
- package/scripts/gsd-t-update-check.js +13 -4
- package/scripts/hooks/gsd-t-conversation-capture.js +258 -0
- package/templates/CLAUDE-global.md +54 -0
package/bin/gsd-t-unattended.cjs
CHANGED
@@ -62,6 +62,19 @@ function _emit(projectDir, ev) {
   try { _esAppendEvent(projectDir, ev); } catch (_) { /* never halt the loop */ }
 }

+// M44 D9 (v1.5.0) — planner-driven multi-worker fan-out. Lazy-loaded so unit
+// tests can stub via deps._runParallel without touching the real module.
+let _parallelModule = null;
+function _loadRunParallel() {
+  if (_parallelModule) return _parallelModule;
+  try {
+    _parallelModule = require("./gsd-t-parallel.cjs");
+  } catch {
+    _parallelModule = { runParallel: () => ({ workerCount: 0, parallelTasks: [], plan: [] }) };
+  }
+  return _parallelModule;
+}
+
 // M42 D1 — transcript tee. Captures each worker's stdout lines to an ndjson
 // file and registers the spawn so the dashboard sidebar can list + render it.
 // Best-effort: every call is swallowed so tee failures never halt the loop.
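The `catch {}` fallback above means a missing or broken `gsd-t-parallel.cjs` silently degrades to a planner that never fans out (`workerCount: 0`). A unit test can use the same seam from the other direction. A hypothetical sketch (not from the package's own test suite; the stub payload values are illustrative, and `runMainLoop`'s `deps` parameter appears later in this diff):

    // Hypothetical test sketch: force a 3-worker fan-out decision
    // without ever loading the real gsd-t-parallel.cjs planner.
    const unattended = require("./gsd-t-unattended.cjs");

    const deps = {
      _runParallel: () => ({ workerCount: 3, parallelTasks: ["T1", "T2", "T3"], plan: [] }),
    };
    // await unattended.runMainLoop(state, dir, opts, deps, ctx);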
@@ -73,7 +86,7 @@ const { checkHeartbeat: _checkHeartbeat } = require("./gsd-t-unattended-heartbea

 // ── Constants ───────────────────────────────────────────────────────────────

-const CONTRACT_VERSION = "1.
+const CONTRACT_VERSION = "1.5.0";
 const UNATTENDED_DIR_REL = path.join(".gsd-t", ".unattended");
 const PID_FILE = "supervisor.pid";
 const STATE_FILE = "state.json";
@@ -122,6 +135,8 @@ module.exports = {
   releaseSleepPrevention,
   runMainLoop,
   _spawnWorker,
+  _spawnWorkerFanOut,
+  _partitionTasks,
   _appendRunLog,
   CONTRACT_VERSION,
   UNATTENDED_DIR_REL,
@@ -132,6 +147,50 @@
   DEFAULT_HEARTBEAT_POLL_MS,
 };

+function _reconcile(state, results) {
+  if (!Array.isArray(results) || results.length === 0) return;
+  for (const r of results) {
+    if (!r || typeof r !== 'object') continue;
+    // append-only completedTasks (preserve order, dedupe)
+    if (Array.isArray(r.tasksDone) && r.tasksDone.length > 0) {
+      const current = new Set(state.completedTasks || []);
+      for (const t of r.tasksDone) {
+        if (!current.has(t)) {
+          state.completedTasks = (state.completedTasks || []).concat([t]);
+          current.add(t);
+        }
+      }
+    }
+    // last-writer-wins on status — but 'error' is sticky: once set, it stays
+    // until the next explicit non-error status in a later iter.
+    if (r.status && r.status !== state.status) {
+      state.status = r.status;
+    }
+    // verifyNeeded is OR-across-results: any iter that flags it wins.
+    if (r.verifyNeeded === true) {
+      state.verifyNeeded = true;
+    }
+    // artifacts: append-only, concat arrays.
+    if (Array.isArray(r.artifacts) && r.artifacts.length > 0) {
+      state.artifacts = (state.artifacts || []).concat(r.artifacts);
+    }
+  }
+  // NOTE: `state.iter` is advanced by the main while loop (pre-M46 contract:
+  // one increment per fan-out pass, regardless of worker/batch count). We do
+  // NOT advance it here — doing so would double-increment against the
+  // existing supervisor-contract invariant (surfaced by m43/m44 tests).
+  state.lastBatch = {
+    size: results.length,
+    endedAt: new Date().toISOString(),
+    errorCount: results.filter(r => r && r.status === 'error').length,
+  };
+}
+
+// M46 D1 T2 — expose the extracted single-iter body for future unit tests
+// (T7) and the iter-parallel driver (T4/T5). Kept out of the main exports
+// block so consumers don't accidentally import implementation details.
+module.exports.__test__ = { _runOneIter, _computeIterBatchSize, _runIterParallel, _reconcile };
+
 // ── parseArgs ───────────────────────────────────────────────────────────────

 /**
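A worked example of `_reconcile`'s merge rules, using the `__test__` handle exported just above (values illustrative):

    const { _reconcile } = require("./gsd-t-unattended.cjs").__test__;

    const state = { completedTasks: ["T1"], status: "running" };
    _reconcile(state, [
      { status: "running", tasksDone: ["T1", "T2"], artifacts: ["a.log"] },
      { status: "error",   tasksDone: ["T3"], verifyNeeded: true },
    ]);
    // state.completedTasks → ["T1", "T2", "T3"]  (append-only, T1 deduped)
    // state.status         → "error"             (last writer wins)
    // state.verifyNeeded   → true                (OR across results)
    // state.lastBatch      → { size: 2, errorCount: 1, endedAt: <ISO ts> }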
@@ -927,6 +986,10 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
     deps._spawnWorker || (useTestStub ? _testModeSpawnWorker : _spawnWorker);
   const milestoneComplete =
     deps._isMilestoneComplete || (useTestStub ? () => true : isMilestoneComplete);
+  // M44 D9 (v1.5.0) — planner injected for multi-worker iter fan-out.
+  // Tests stub via deps._runParallel; production lazy-loads from gsd-t-parallel.cjs.
+  const runParallelImpl =
+    deps._runParallel || ((o) => _loadRunParallel().runParallel(o));
   const stopCheck = deps._stopRequested || stopRequested;
   const workerTimeoutMs = opts.workerTimeoutMs || DEFAULT_WORKER_TIMEOUT_MS;
   const staleHeartbeatMs =
@@ -947,61 +1010,200 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
   const heartbeatEnabled = !deps._disableHeartbeat && !useTestStub;
   const projectDir = state.projectDir;

+  // M46 D1 T2 — pure extract-method refactor. The body of each iteration
+  // now lives in the top-level `_runOneIter` helper (below). The while loop
+  // itself is unchanged in semantics: stop-check and isDone evaluate per
+  // pass, and any terminal state.status ({"done","failed"}) written by the
+  // iter body causes us to break, matching every pre-refactor `break` path.
+  // Non-terminal outcomes fall through to the next iteration, matching the
+  // pre-refactor `continue` paths.
+  const iterCtx = {
+    dir,
+    fn,
+    config,
+    spawnWorker,
+    milestoneComplete,
+    runParallelImpl,
+    workerTimeoutMs,
+    heartbeatImpl,
+    heartbeatEnabled,
+    staleHeartbeatMs,
+    heartbeatPollMs,
+    projectDir,
+    verbose: !!opts.verbose,
+  };
   while (!isDone(state) && !stopCheck(projectDir)) {
-
-
-
-
-
-
-
-
-    }
-    const
-
-
-
-
-
-
-
-
-
-
-
-
-
+    const batchSize = _computeIterBatchSize(state, opts);
+    const _batchStartMs = Date.now();
+    try {
+      fs.appendFileSync(
+        path.join(dir, RUN_LOG),
+        `[iter-batch-start] batch-size=${batchSize} iter=${state.iter} ts=${new Date(_batchStartMs).toISOString()}\n`,
+        "utf8"
+      );
+    } catch (_) { /* best effort */ }
+    const results = await _runIterParallel(state, opts, (s, o) => _runOneIter(s, iterCtx), batchSize);
+    _reconcile(state, results);
+    try {
+      const _ok = results.filter((r) => r.status !== "error").length;
+      const _fail = results.length - _ok;
+      const _durSec = ((Date.now() - _batchStartMs) / 1000).toFixed(1);
+      fs.appendFileSync(
+        path.join(dir, RUN_LOG),
+        `[iter-batch-complete] size=${results.length} ok=${_ok} fail=${_fail} duration=${_durSec}s iter=${state.iter}\n`,
+        "utf8"
+      );
+    } catch (_) { /* best effort */ }
+    if (isTerminal(state.status)) break;
+  }

-
-
-
-
+  // If we exited because the user dropped a stop sentinel and no terminal
+  // status has been assigned yet, transition to 'stopped' now (contract §10).
+  // The sentinel file itself is NOT removed by the supervisor — it stays on
+  // disk as evidence, to be cleaned by the next launch via
+  // `cleanStaleStopSentinel`.
+  if (!isTerminal(state.status) && stopCheck(projectDir)) {
+    state.status = "stopped";
     writeState(state, dir);
+  }
+  return state;
+}
+
+// ── _runOneIter (M46 D1 T2) ─────────────────────────────────────────────────
+
+/**
+ * Body of a single supervisor iteration, extracted verbatim from the
+ * `runMainLoop` while-loop (pre-M46-D1). Mutates `state` in place exactly as
+ * the original body did — all writeState calls, event-stream emits, run.log
+ * and token-log appends, heartbeat wiring, fan-out dispatch, and exit-code
+ * classification are preserved line-for-line.
+ *
+ * `opts` here is the per-iter context bundle assembled in runMainLoop (not
+ * the supervisor-level opts object). It carries the closure values the body
+ * used to read from the enclosing scope: fn, config, dir, projectDir,
+ * spawnWorker, milestoneComplete, runParallelImpl, workerTimeoutMs,
+ * heartbeatImpl, heartbeatEnabled, staleHeartbeatMs, heartbeatPollMs, verbose.
+ *
+ * Returns an IterResult per iter-parallel-contract.md v1.0.0 §4. T2 emits a
+ * minimal shape (tasksDone = []) — T4/T5 will populate tasksDone and use
+ * `status` to drive `_computeIterBatchSize`. For now the while-loop driver
+ * consumes only `isTerminal(state.status)`; the returned value is forward-
+ * compatible scaffolding.
+ */
+async function _runOneIter(state, opts) {
+  const {
+    dir, fn, config, spawnWorker, milestoneComplete, runParallelImpl,
+    workerTimeoutMs, heartbeatImpl, heartbeatEnabled,
+    staleHeartbeatMs, heartbeatPollMs, projectDir,
+  } = opts;
+
+  const _result = (status, extras) => ({
+    iter: state.iter,
+    status,
+    tasksDone: [],
+    verifyNeeded: status === "verify-needed",
+    artifacts: extras || {},
+  });

+  // ── PRE-WORKER HOOK (contract §12) ─────────────────────────────────────
+  // Refusal → halt with status=failed, lastExit=6 (caps) or 2 (validate).
+  const capIter = fn.checkIterationCap(state, config);
+  if (!capIter.ok) {
+    state.status = "failed";
+    state.lastExit = capIter.code || 6;
+    writeState(state, dir);
+    return _result("failed", { errorMessage: `iteration_cap:${state.lastExit}` });
+  }
+  const capWall = fn.checkWallClockCap(state, config);
+  if (!capWall.ok) {
+    state.status = "failed";
+    state.lastExit = capWall.code || 6;
+    writeState(state, dir);
+    return _result("failed", { errorMessage: `wall_clock_cap:${state.lastExit}` });
+  }
+  const vRes = fn.validateState(state);
+  if (!vRes.ok) {
+    state.status = "failed";
+    state.lastExit = vRes.code || 2;
+    writeState(state, dir);
+    return _result("failed", { errorMessage: `validate_state:${state.lastExit}` });
+  }
+
+  // Pre-spawn bookkeeping
+  state.iter = (state.iter || 0) + 1;
+  const workerStart = new Date();
+  state.lastWorkerStartedAt = workerStart.toISOString();
+  writeState(state, dir);
+
+  _emit(projectDir, {
+    ts: workerStart.toISOString(),
+    iter: state.iter,
+    type: "task_start",
+    source: "supervisor",
+    milestone: state.milestone || "",
+    wave: state.wave || "",
+    task: state.nextTask || "",
+  });
+
+  let res;
+  const workerStartMs = workerStart.getTime();
+  const hbOpts = heartbeatEnabled
+    ? {
+        onHeartbeatCheck: () =>
+          heartbeatImpl({
+            projectDir,
+            workerStartedAt: workerStartMs,
+            staleHeartbeatMs,
+          }),
+        heartbeatPollMs,
+      }
+    : {};
+
+  // M44 D9 (v1.5.0) — planner-driven fan-out decision for this iter.
+  // Ask runParallel whether the current task graph supports ≥2 concurrent
+  // workers. Any failure in the planner path MUST fall back to the single-
+  // worker spawn — the parallel path is purely additive.
+  let iterPlan = null;
+  try {
+    iterPlan = runParallelImpl({
+      projectDir,
+      mode: "unattended",
+      milestone: state.milestone || null,
+      dryRun: true,
+    });
+  } catch (e) {
+    iterPlan = null;
     _emit(projectDir, {
-      ts: workerStart.toISOString(),
       iter: state.iter,
-      type: "
+      type: "parallelism_reduced",
       source: "supervisor",
-
-
-
+      original_count: null,
+      reduced_count: 1,
+      reason: `planner_error:${(e && e.message) || "unknown"}`,
     });
+  }
+  const fanOutCount = iterPlan && Number(iterPlan.workerCount) >= 2 ? Number(iterPlan.workerCount) : 1;
+  const parallelTaskIds = iterPlan && Array.isArray(iterPlan.parallelTasks) ? iterPlan.parallelTasks : [];
+  const subsets = fanOutCount >= 2 ? _partitionTasks(parallelTaskIds, fanOutCount) : null;
+  const useFanOut = !!(subsets && subsets.length >= 2);

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  try {
+    if (useFanOut) {
+      _emit(projectDir, {
+        ts: workerStart.toISOString(),
+        iter: state.iter,
+        type: "fan_out",
+        source: "supervisor",
+        worker_count: subsets.length,
+        task_ids: parallelTaskIds,
+      });
+      res = await _spawnWorkerFanOut(state, {
+        cwd: projectDir,
+        timeout: workerTimeoutMs,
+        verbose: !!opts.verbose,
+        ...hbOpts,
+      }, spawnWorker, subsets);
+    } else {
       res = spawnWorker(state, {
         cwd: projectDir,
         timeout: workerTimeoutMs,
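For reference, the two appendFileSync templates in the batch driver above yield run.log marker pairs of this shape (values illustrative; a serial pass has batch-size 1, and `state.iter` is incremented inside `_runOneIter`, so the completion marker reports the post-increment counter):

    [iter-batch-start] batch-size=1 iter=7 ts=2026-02-03T10:15:00.000Z
    [iter-batch-complete] size=1 ok=1 fail=0 duration=41.3s iter=8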
@@ -1011,207 +1213,294 @@ async function runMainLoop(state, dir, opts, deps, ctx) {
       if (res && typeof res.then === "function") {
         res = await res;
       }
-    } catch (e) {
-      // Defensive: a real spawnSync shouldn't throw, but a shim could.
-      res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
-    }
-    res = res || { status: null, stdout: "", stderr: "", signal: null };
-
-    const workerEnd = new Date();
-    const elapsedMs = workerEnd.getTime() - workerStart.getTime();
-    const stdout = typeof res.stdout === "string" ? res.stdout : "";
-    const stderr = typeof res.stderr === "string" ? res.stderr : "";
-
-    // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
-    // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
-    // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
-    // Heartbeat wins on ties because it's the more specific signal.
-    let exitCode;
-    let lastExitReason = null;
-    if (res.staleHeartbeat === true) {
-      exitCode = 125;
-      lastExitReason = "stale_heartbeat";
-    } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
-      exitCode = 124;
-      lastExitReason = "worker_timeout";
-    } else {
-      exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
     }
+  } catch (e) {
+    // Defensive: a real spawnSync shouldn't throw, but a shim could.
+    res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
+  }
+  res = res || { status: null, stdout: "", stderr: "", signal: null };
+
+  const workerEnd = new Date();
+  const elapsedMs = workerEnd.getTime() - workerStart.getTime();
+  const stdout = typeof res.stdout === "string" ? res.stdout : "";
+  const stderr = typeof res.stderr === "string" ? res.stderr : "";
+
+  // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
+  // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
+  // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
+  // Heartbeat wins on ties because it's the more specific signal.
+  let exitCode;
+  let lastExitReason = null;
+  if (res.staleHeartbeat === true) {
+    exitCode = 125;
+    lastExitReason = "stale_heartbeat";
+  } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
+    exitCode = 124;
+    lastExitReason = "worker_timeout";
+  } else {
+    exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
+  }

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  // v3.13.11 Bug 1: when a watchdog fires, make the event explicit in
+  // run.log so operators can see WHICH iteration was cut without inferring
+  // from exit codes. The marker is prepended to stdout and written in the
+  // single per-iter run.log append (no duplicate header).
+  let loggedStdout = stdout;
+  if (exitCode === 124) {
+    const marker =
+      `[worker_timeout] iter=${state.iter} budget=${workerTimeoutMs}ms ` +
+      `elapsed=${elapsedMs}ms — absolute-backstop SIGTERM delivered, ` +
+      `supervisor continues relay per contract §16.\n`;
+    loggedStdout = marker + (stdout || "");
+  } else if (exitCode === 125) {
+    const reason = res.heartbeatReason || "no recent events.jsonl writes";
+    const marker =
+      `[stale_heartbeat] iter=${state.iter} threshold=${staleHeartbeatMs}ms ` +
+      `elapsed=${elapsedMs}ms reason="${reason}" — ` +
+      `heartbeat watchdog SIGTERM delivered, supervisor continues relay.\n`;
+    loggedStdout = marker + (stdout || "");
+  }
+
+  // Append the full worker output to run.log (never truncate).
+  _appendRunLog(dir, state.iter, workerEnd, exitCode, loggedStdout, stderr);
+
+  // Append to token-log.md (Fix 1, v3.12.12) — supervisor workers write rows
+  // so the log captures headless/unattended activity, not just interactive spawns.
+  _appendTokenLog(projectDir, {
+    dtStart: workerStart.toISOString().slice(0, 16).replace("T", " "),
+    dtEnd: workerEnd.toISOString().slice(0, 16).replace("T", " "),
+    command: "gsd-t-resume",
+    durationS: Math.round(elapsedMs / 1000),
+    exitCode,
+    iter: state.iter,
+  });

-
-
-
-
-
-
-
-
-
-
-
+  // Post-spawn state update
+  state.lastExit = exitCode;
+  state.lastWorkerFinishedAt = workerEnd.toISOString();
+  state.lastElapsedMs = elapsedMs;
+  if (lastExitReason) {
+    state.lastExitReason = lastExitReason;
+  } else if (exitCode === 0) {
+    state.lastExitReason = "clean";
+  } else {
+    state.lastExitReason = `exit_${exitCode}`;
+  }
+  // M44 D9 (v1.5.0) — per-iter multi-worker aggregates. Present only when the
+  // planner selected fan-out; single-worker iters omit these fields so the
+  // state schema stays backward-compatible with v1.4.x readers.
+  if (useFanOut && Array.isArray(res.workerResults)) {
+    state.lastExits = res.workerResults.map((w) => ({
+      idx: w.idx,
+      code: typeof w.status === "number" ? w.status : null,
+      taskIds: w.taskIds || [],
+      elapsedMs: w.elapsedMs,
+      spawnId: w.spawnId || null,
+    }));
+    state.workerPids = res.workerResults.map((w) => w.spawnId || null);
+    state.lastFanOutCount = res.workerResults.length;
+  } else {
+    // Clear stale multi-worker fields on single-worker iters so readers
+    // never see a mix of regimes.
+    if (state.lastExits) delete state.lastExits;
+    if (state.workerPids) delete state.workerPids;
+    if (state.lastFanOutCount) delete state.lastFanOutCount;
+  }
+  writeState(state, dir);
+
+  // Event-stream: task_complete on success, error on non-zero.
+  const durationS = Math.round(elapsedMs / 1000);
+  if (exitCode === 0) {
+    _emit(projectDir, {
+      ts: workerEnd.toISOString(),
       iter: state.iter,
+      type: "task_complete",
+      source: "supervisor",
+      task: state.nextTask || "",
+      verdict: "pass",
+      duration_s: durationS,
     });
+  } else {
+    _emit(projectDir, {
+      ts: workerEnd.toISOString(),
+      iter: state.iter,
+      type: "error",
+      source: "supervisor",
+      error: `worker exit ${exitCode}`,
+      recoverable: exitCode !== 4 && exitCode !== 5,
+    });
+  }

-
-
-
-
-
-
-
-
-
-
+  // ── POST-WORKER HOOK (contract §12) ────────────────────────────────────
+  // Read the tail of run.log for pattern detection. ~200 lines is enough
+  // to span the last several iteration blocks for the gutter detector.
+  let runLogTail = "";
+  try {
+    const logPath = path.join(dir, RUN_LOG);
+    if (fs.existsSync(logPath)) {
+      const all = fs.readFileSync(logPath, "utf8");
+      const lines = all.split(/\r?\n/);
+      runLogTail = lines.slice(-200).join("\n");
     }
+  } catch (_) {
+    // best effort — tail read failure does not halt the loop
+  }
+  const blocker = fn.detectBlockerSentinel(runLogTail);
+  if (!blocker.ok) {
+    state.status = "failed";
+    state.lastExit = blocker.code || 6;
    writeState(state, dir);
+    return _result("failed", { errorMessage: `blocker_sentinel:${state.lastExit}` });
+  }
+  const gutter = fn.detectGutter(state, runLogTail, config);
+  if (!gutter.ok) {
+    state.status = "failed";
+    state.lastExit = gutter.code || 6;
+    writeState(state, dir);
+    return _result("failed", { errorMessage: `gutter:${state.lastExit}` });
+  }

-
-
-    if
-
-
-      iter: state.iter,
-      type: "task_complete",
-      source: "supervisor",
-      task: state.nextTask || "",
-      verdict: "pass",
-      duration_s: durationS,
-    });
-    } else {
-      _emit(projectDir, {
-        ts: workerEnd.toISOString(),
-        iter: state.iter,
-        type: "error",
-        source: "supervisor",
-        error: `worker exit ${exitCode}`,
-        recoverable: exitCode !== 4 && exitCode !== 5,
-      });
-    }
-
-    // ── POST-WORKER HOOK (contract §12) ────────────────────────────────────
-    // Read the tail of run.log for pattern detection. ~200 lines is enough
-    // to span the last several iteration blocks for the gutter detector.
-    let runLogTail = "";
-    try {
-      const logPath = path.join(dir, RUN_LOG);
-      if (fs.existsSync(logPath)) {
-        const all = fs.readFileSync(logPath, "utf8");
-        const lines = all.split(/\r?\n/);
-        runLogTail = lines.slice(-200).join("\n");
-      }
-    } catch (_) {
-      // best effort — tail read failure does not halt the loop
-    }
-    const blocker = fn.detectBlockerSentinel(runLogTail);
-    if (!blocker.ok) {
-      state.status = "failed";
-      state.lastExit = blocker.code || 6;
+  // Terminal exit classification
+  if (exitCode === 0) {
+    // Success — check if the milestone is now complete.
+    if (milestoneComplete(projectDir, state.milestone)) {
+      state.status = "done";
      writeState(state, dir);
-
-    }
-    const gutter = fn.detectGutter(state, runLogTail, config);
-    if (!gutter.ok) {
-      state.status = "failed";
-      state.lastExit = gutter.code || 6;
-      writeState(state, dir);
-      break;
-    }
-
-    // Terminal exit classification
-    if (exitCode === 0) {
-      // Success — check if the milestone is now complete.
-      if (milestoneComplete(projectDir, state.milestone)) {
-        state.status = "done";
-        writeState(state, dir);
-        break;
-      }
-      // Not yet done — continue relay.
-      _emit(projectDir, {
-        iter: state.iter,
-        type: "retry",
-        source: "supervisor",
-        attempt: state.iter,
-        reason: "milestone_incomplete",
-      });
-      continue;
+      return _result("done");
    }
-
-    // Unrecoverable blocker.
-    state.status = "failed";
-    writeState(state, dir);
-    break;
-    }
-    if (exitCode === 5) {
-      // Command dispatch failure — worker invocation is broken.
-      state.status = "failed";
-      writeState(state, dir);
-      break;
-    }
-    if (exitCode === 124) {
-      // Timeout — continue unless the iter cap is hit on the next check.
-      _emit(projectDir, {
-        iter: state.iter,
-        type: "retry",
-        source: "supervisor",
-        attempt: state.iter,
-        reason: "timeout",
-      });
-      continue;
-    }
-    if (exitCode === 125) {
-      // Stale heartbeat (M43) — continue unless the iter cap hits. The
-      // heartbeat kill is recoverable by definition: the worker was not
-      // emitting events, which is the most common class of stuck iteration
-      // (e.g. child stuck on a long Bash call with no tool_call emits).
-      _emit(projectDir, {
-        iter: state.iter,
-        type: "retry",
-        source: "supervisor",
-        attempt: state.iter,
-        reason: "stale_heartbeat",
-      });
-      continue;
-    }
-    // Non-terminal (1/2/3) — continue the relay.
+    // Not yet done — continue relay.
    _emit(projectDir, {
      iter: state.iter,
      type: "retry",
      source: "supervisor",
      attempt: state.iter,
-      reason:
+      reason: "milestone_incomplete",
    });
+    return _result("running");
  }
-
-
-
-  // The sentinel file itself is NOT removed by the supervisor — it stays on
-  // disk as evidence, to be cleaned by the next launch via
-  // `cleanStaleStopSentinel`.
-  if (!isTerminal(state.status) && stopCheck(projectDir)) {
-    state.status = "stopped";
+  if (exitCode === 4) {
+    // Unrecoverable blocker.
+    state.status = "failed";
    writeState(state, dir);
+    return _result("failed", { errorMessage: "exit_4_unrecoverable" });
  }
-
+  if (exitCode === 5) {
+    // Command dispatch failure — worker invocation is broken.
+    state.status = "failed";
+    writeState(state, dir);
+    return _result("failed", { errorMessage: "exit_5_dispatch_failure" });
+  }
+  if (exitCode === 124) {
+    // Timeout — continue unless the iter cap is hit on the next check.
+    _emit(projectDir, {
+      iter: state.iter,
+      type: "retry",
+      source: "supervisor",
+      attempt: state.iter,
+      reason: "timeout",
+    });
+    return _result("running");
+  }
+  if (exitCode === 125) {
+    // Stale heartbeat (M43) — continue unless the iter cap hits. The
+    // heartbeat kill is recoverable by definition: the worker was not
+    // emitting events, which is the most common class of stuck iteration
+    // (e.g. child stuck on a long Bash call with no tool_call emits).
+    _emit(projectDir, {
+      iter: state.iter,
+      type: "retry",
+      source: "supervisor",
+      attempt: state.iter,
+      reason: "stale_heartbeat",
+    });
+    return _result("running");
+  }
+  // Non-terminal (1/2/3) — continue the relay.
+  _emit(projectDir, {
+    iter: state.iter,
+    type: "retry",
+    source: "supervisor",
+    attempt: state.iter,
+    reason: `exit_${exitCode}`,
+  });
+  return _result("running");
+}
+
+// ── _computeIterBatchSize (M46 D1 T3) ───────────────────────────────────────
+
+/**
+ * Decide how many iterations the supervisor main loop should dispatch
+ * concurrently in the next pass. Implements the mode-safety rules from
+ * `.gsd-t/contracts/iter-parallel-contract.md` v1.0.0 §3.1.
+ *
+ * Rules evaluated top-down; first match wins:
+ *   1. status === "verify-needed"      → 1 (serial verify gate)
+ *   2. milestoneBoundary === true      → 1 (milestone boundary)
+ *   3. status === "complete-milestone" → 1 (single-shot closeout)
+ *   4. otherwise → min(opts.maxIterParallel ?? 4, remainingIters, 8)
+ *      where remainingIters = (state.maxIterations ?? Infinity) - (state.iter ?? 0)
+ *
+ * Never returns less than 1.
+ */
+function _computeIterBatchSize(state, opts) {
+  if (state && state.status === "verify-needed") return 1;
+  if (state && state.milestoneBoundary === true) return 1;
+  if (state && state.status === "complete-milestone") return 1;
+
+  // Production default is 1 (serial, pre-M46 behavior). Iter-parallelism is
+  // opt-in via `opts.maxIterParallel` — callers that pass a number enable it.
+  // Rationale: `_runOneIter` mutates `state.iter` and other shared fields
+  // (heartbeat bookkeeping, writeState) that are not safe to execute on the
+  // same state object concurrently. Unit tests exercise the parallel path
+  // with explicit batch sizes; production main loop omits the flag and runs
+  // strictly serial, preserving the pre-M46 supervisor contract (one iter
+  // counter increment per fan-out pass). See backlog #24 for the follow-up
+  // that makes `_runOneIter` state-clone-safe and lifts this gate.
+  if (!opts || typeof opts.maxIterParallel !== "number") return 1;
+
+  const cap = opts.maxIterParallel;
+  const maxIters = state && typeof state.maxIterations === "number"
+    ? state.maxIterations
+    : Infinity;
+  const currentIter = state && typeof state.iter === "number"
+    ? state.iter
+    : 0;
+  const remainingIters = maxIters - currentIter;
+
+  const size = Math.min(cap, remainingIters, 8);
+  return size < 1 ? 1 : size;
+}
+
+// ── _runIterParallel (M46 D1 T4) ────────────────────────────────────────────
+
+/**
+ * Dispatch `batchSize` independent iter slices concurrently and return an
+ * IterResult[] of exactly that length. Implements the error-isolation rule
+ * from `.gsd-t/contracts/iter-parallel-contract.md` v1.0.0 §4.2: a single
+ * rejected iter is translated into an IterResult with status "error" and
+ * does NOT cancel siblings. The caller decides how to react.
+ *
+ * iterFn defaults to `_runOneIter` for the T7 tests; production callers
+ * (T5 main-loop rewrite) pass the same.
+ */
+async function _runIterParallel(state, opts, iterFn, batchSize) {
+  const fn = typeof iterFn === "function" ? iterFn : _runOneIter;
+  const n = typeof batchSize === "number" && batchSize >= 1 ? batchSize : 1;
+  const slices = [];
+  for (let i = 0; i < n; i++) slices.push(Promise.resolve().then(() => fn(state, opts)));
+  const settled = await Promise.allSettled(slices);
+  return settled.map((s) => {
+    if (s.status === "fulfilled") return s.value;
+    const reason = s.reason;
+    const msg = (reason && reason.message) ? reason.message : String(reason);
+    return {
+      status: "error",
+      tasksDone: [],
+      verifyNeeded: false,
+      artifacts: [],
+      error: msg,
+    };
+  });
 }

 // ── _appendTokenLog (Fix 1, v3.12.12) ───────────────────────────────────────
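Spot-checking the first-match-wins rules above through the `__test__` handle (expected values follow directly from the code; this is an illustrative sketch, not taken from the package's own test suite):

    const { _computeIterBatchSize } = require("./gsd-t-unattended.cjs").__test__;

    _computeIterBatchSize({ status: "verify-needed" }, { maxIterParallel: 4 }); // → 1 (serial verify gate)
    _computeIterBatchSize({ status: "running" }, {});                           // → 1 (flag absent: serial default)
    _computeIterBatchSize({ status: "running", iter: 2, maxIterations: 10 },
                          { maxIterParallel: 4 });                              // → 4 (min(4, 8, 8))
    _computeIterBatchSize({ status: "running", iter: 9, maxIterations: 10 },
                          { maxIterParallel: 4 });                              // → 1 (one iter of budget left)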
@@ -1285,11 +1574,24 @@ function _spawnWorker(state, opts) {
   // id as parent, so shims inside the worker write state files that the tree
   // builder can attach under the supervisor root.
   workerEnv.GSD_T_AGENT_ID =
-    "supervisor-iter-" + (state && state.iter ? state.iter : Date.now())
+    "supervisor-iter-" + (state && state.iter ? state.iter : Date.now()) +
+    (state && typeof state._workerIndex === "number" ? `-w${state._workerIndex}` : "");
   if (process.env.GSD_T_AGENT_ID) {
     workerEnv.GSD_T_PARENT_AGENT_ID = process.env.GSD_T_AGENT_ID;
   }

+  // M44 D9 (v1.5.0) — planner-driven fan-out: when the supervisor partitions
+  // the iter's task graph across N workers, each worker carries its disjoint
+  // task-id subset via env var. The worker prompt consumes this to (a) skip
+  // the intra-worker Team Mode block (the fan-out is the team), (b) restrict
+  // itself to its assigned task IDs.
+  const assignedTaskIds = Array.isArray(opts && opts.taskIds) ? opts.taskIds : null;
+  if (assignedTaskIds && assignedTaskIds.length > 0) {
+    workerEnv.GSD_T_WORKER_TASK_IDS = assignedTaskIds.join(",");
+    workerEnv.GSD_T_WORKER_INDEX = String((state && state._workerIndex) || 0);
+    workerEnv.GSD_T_WORKER_TOTAL = String((state && state._workerTotal) || 1);
+  }
+
   // M42 D1 — allocate a spawn-id + open transcript before spawning. parentId
   // is the supervisor's own spawn-id (set once at supervisor start via
   // GSD_T_SPAWN_ID env) so the sidebar can render parent-indented trees.
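On the receiving side, a fan-out worker sees its assignment as plain environment variables. A hypothetical worker-side sketch (the env var names are the ones set above; the parsing itself is illustrative):

    // Hypothetical consumer sketch, mirroring the producer code above.
    const taskIds = (process.env.GSD_T_WORKER_TASK_IDS || "")
      .split(",")
      .filter(Boolean);                                  // e.g. ["T2", "T5"]
    const workerIndex = Number(process.env.GSD_T_WORKER_INDEX || 0);
    const workerTotal = Number(process.env.GSD_T_WORKER_TOTAL || 1);
    if (taskIds.length > 0) {
      // Supervisor-level fan-out: run only the assigned subset, sequentially,
      // and do not fan out again inside the worker.
    }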
@@ -1353,6 +1655,18 @@ function _spawnWorker(state, opts) {
     "",
     "# Team Mode (Intra-Wave Parallelism)",
     "",
+    "M44 D9 (v1.5.0+) — check env `GSD_T_WORKER_TASK_IDS` FIRST. If SET, you",
+    "are one of N planner-assigned workers in a supervisor-level fan-out.",
+    "The value is your disjoint task-id subset. DO NOT spawn Task subagents",
+    "to re-fan-out (the supervisor already did). Execute ONLY your assigned",
+    "task IDs sequentially in this worker, then return. Skip the rest of",
+    "this block.",
+    "",
+    "If GSD_T_WORKER_TASK_IDS is UNSET, the supervisor's planner decided",
+    "N=1 for this iter (sequential fallback: gates vetoed, file-disjointness",
+    "unprovable, or est CW% too high). Proceed with the legacy worker-level",
+    "Team Mode below:",
+    "",
     "Before executing tasks for this iteration, read `.gsd-t/partition.md` to",
     "identify the current wave and which domains belong to it.",
     "",
@@ -1427,6 +1741,97 @@ function _spawnWorker(state, opts) {
   return finalize(spawnResult);
 }

+// ── _spawnWorkerFanOut (M44 D9, contract v1.5.0) ────────────────────────────
+
+/**
+ * Planner-driven multi-worker fan-out. Spawns N concurrent workers via the
+ * injected `spawnWorker` shim, each receiving a disjoint subset of the iter's
+ * parallel task IDs (passed through `opts.taskIds`). Waits on all via
+ * Promise.all before returning a merged result shape compatible with the
+ * single-worker path.
+ *
+ * Merge semantics:
+ *  - `status` — 0 if every worker cleanly returned 0, else the first
+ *    non-zero status encountered (worst exit wins).
+ *  - `stdout` — per-worker blocks joined by `[WORKER i/N tasks=...]` headers.
+ *  - `stderr` — concatenated.
+ *  - `staleHeartbeat`/`timedOut` — true if any worker triggered them.
+ *  - `workerResults` — array of per-worker {status, taskIds, pid, spawnId, elapsedMs}
+ *    for state.json aggregation.
+ *
+ * The caller (runMainLoop) treats this result exactly like a single-worker
+ * result for downstream classification. Multi-worker observability lives in
+ * the `workerResults` array, not in new control-flow branches.
+ */
+async function _spawnWorkerFanOut(state, opts, spawnWorker, subsets) {
+  const launches = subsets.map((taskIds, i) => {
+    const subState = { ...state, _workerIndex: i, _workerTotal: subsets.length, _workerTaskIds: taskIds };
+    const started = Date.now();
+    return Promise.resolve()
+      .then(() => spawnWorker(subState, { ...opts, taskIds }))
+      .then((r) => ({ r: r || {}, taskIds, started, ended: Date.now(), idx: i }))
+      .catch((e) => ({
+        r: { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null },
+        taskIds, started, ended: Date.now(), idx: i,
+      }));
+  });
+  const outcomes = await Promise.all(launches);
+  outcomes.sort((a, b) => a.idx - b.idx);
+
+  let mergedStatus = 0;
+  let stale = false;
+  let timedOut = false;
+  let heartbeatReason = null;
+  const stdoutBlocks = [];
+  const stderrBlocks = [];
+  const workerResults = [];
+
+  for (const o of outcomes) {
+    const s = typeof o.r.status === "number" ? o.r.status : null;
+    if (mergedStatus === 0 && s !== 0) mergedStatus = s === null ? 1 : s;
+    if (o.r.staleHeartbeat) stale = true;
+    if (o.r.timedOut) timedOut = true;
+    if (!heartbeatReason && o.r.heartbeatReason) heartbeatReason = o.r.heartbeatReason;
+    const tag = `[WORKER ${o.idx + 1}/${outcomes.length} tasks=${(o.taskIds || []).join(",") || "-"}]`;
+    stdoutBlocks.push(`${tag}\n${o.r.stdout || ""}`);
+    if (o.r.stderr) stderrBlocks.push(`${tag}\n${o.r.stderr}`);
+    workerResults.push({
+      idx: o.idx,
+      status: s,
+      taskIds: o.taskIds,
+      spawnId: o.r.spawnId || null,
+      signal: o.r.signal || null,
+      elapsedMs: o.ended - o.started,
+      staleHeartbeat: !!o.r.staleHeartbeat,
+      timedOut: !!o.r.timedOut,
+    });
+  }
+
+  return {
+    status: mergedStatus,
+    stdout: stdoutBlocks.join("\n"),
+    stderr: stderrBlocks.join("\n"),
+    signal: null,
+    timedOut,
+    staleHeartbeat: stale,
+    heartbeatReason,
+    workerResults,
+    fanOutCount: outcomes.length,
+  };
+}
+
+/**
+ * Partition a task-id list into `workerCount` roughly-equal subsets. Simple
+ * round-robin — each subset is non-empty as long as `tasks.length >= workerCount`.
+ */
+function _partitionTasks(tasks, workerCount) {
+  if (!Array.isArray(tasks) || tasks.length === 0 || workerCount < 1) return [];
+  const n = Math.min(workerCount, tasks.length);
+  const subsets = Array.from({ length: n }, () => []);
+  for (let i = 0; i < tasks.length; i++) subsets[i % n].push(tasks[i]);
+  return subsets;
+}
+
 // ── _testModeSpawnWorker ────────────────────────────────────────────────────

 /**