@tekyzinc/gsd-t 3.13.16 → 3.16.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/CHANGELOG.md +44 -0
  2. package/README.md +1 -0
  3. package/bin/gsd-t-benchmark-orchestrator.js +437 -0
  4. package/bin/gsd-t-capture-lint.cjs +276 -0
  5. package/bin/gsd-t-completion-check.cjs +106 -0
  6. package/bin/gsd-t-orchestrator-config.cjs +64 -0
  7. package/bin/gsd-t-orchestrator-queue.cjs +180 -0
  8. package/bin/gsd-t-orchestrator-recover.cjs +231 -0
  9. package/bin/gsd-t-orchestrator-worker.cjs +219 -0
  10. package/bin/gsd-t-orchestrator.js +534 -0
  11. package/bin/gsd-t-stream-feed-client.cjs +151 -0
  12. package/bin/gsd-t-task-brief-compactor.cjs +89 -0
  13. package/bin/gsd-t-task-brief-template.cjs +96 -0
  14. package/bin/gsd-t-task-brief.js +249 -0
  15. package/bin/gsd-t-token-backfill.cjs +366 -0
  16. package/bin/gsd-t-token-capture.cjs +306 -0
  17. package/bin/gsd-t-token-dashboard.cjs +318 -0
  18. package/bin/gsd-t-token-regenerate-log.cjs +129 -0
  19. package/bin/gsd-t-transcript-tee.cjs +246 -0
  20. package/bin/gsd-t-unattended-heartbeat.cjs +188 -0
  21. package/bin/gsd-t-unattended-platform.cjs +191 -27
  22. package/bin/gsd-t-unattended-safety.cjs +8 -1
  23. package/bin/gsd-t-unattended.cjs +192 -31
  24. package/bin/gsd-t.js +329 -2
  25. package/bin/supervisor-pid-fingerprint.cjs +126 -0
  26. package/commands/gsd-t-debug.md +63 -51
  27. package/commands/gsd-t-design-decompose.md +2 -7
  28. package/commands/gsd-t-doc-ripple.md +20 -11
  29. package/commands/gsd-t-execute.md +82 -50
  30. package/commands/gsd-t-integrate.md +43 -16
  31. package/commands/gsd-t-plan.md +20 -7
  32. package/commands/gsd-t-prd.md +19 -12
  33. package/commands/gsd-t-quick.md +64 -29
  34. package/commands/gsd-t-resume.md +51 -4
  35. package/commands/gsd-t-unattended.md +19 -20
  36. package/commands/gsd-t-verify.md +48 -32
  37. package/commands/gsd-t-visualize.md +19 -17
  38. package/commands/gsd-t-wave.md +29 -27
  39. package/docs/architecture.md +16 -0
  40. package/docs/m40-benchmark-report.md +35 -0
  41. package/docs/requirements.md +20 -0
  42. package/package.json +1 -1
  43. package/scripts/gsd-t-dashboard-server.js +291 -4
  44. package/scripts/gsd-t-dashboard.html +31 -1
  45. package/scripts/gsd-t-design-review-server.js +3 -1
  46. package/scripts/gsd-t-stream-feed-server.js +428 -0
  47. package/scripts/gsd-t-stream-feed.html +1168 -0
  48. package/scripts/gsd-t-token-aggregator.js +373 -0
  49. package/scripts/gsd-t-transcript.html +422 -0
  50. package/scripts/hooks/gsd-t-in-session-probe.js +62 -0
  51. package/scripts/hooks/pre-commit-capture-lint +26 -0
  52. package/templates/CLAUDE-global.md +69 -0
  53. package/scripts/gsd-t-agent-dashboard-server.js +0 -424
  54. package/scripts/gsd-t-agent-dashboard.html +0 -1043
@@ -62,9 +62,18 @@ function _emit(projectDir, ev) {
62
62
  try { _esAppendEvent(projectDir, ev); } catch (_) { /* never halt the loop */ }
63
63
  }
64
64
 
65
+ // M42 D1 — transcript tee. Captures each worker's stdout lines to an ndjson
66
+ // file and registers the spawn so the dashboard sidebar can list + render it.
67
+ // Best-effort: every call is swallowed so tee failures never halt the loop.
68
+ const transcriptTee = require("./gsd-t-transcript-tee.cjs");
69
+
70
+ // M43 liveness heartbeat watchdog (contract v1.4.0 §"Heartbeat Watchdog") —
71
+ // pure, testable staleness checker against .gsd-t/events/YYYY-MM-DD.jsonl mtime.
72
+ const { checkHeartbeat: _checkHeartbeat } = require("./gsd-t-unattended-heartbeat.cjs");
73
+
65
74
  // ── Constants ───────────────────────────────────────────────────────────────
66
75
 
67
- const CONTRACT_VERSION = "1.0.0";
76
+ const CONTRACT_VERSION = "1.4.0";
68
77
  const UNATTENDED_DIR_REL = path.join(".gsd-t", ".unattended");
69
78
  const PID_FILE = "supervisor.pid";
70
79
  const STATE_FILE = "state.json";
@@ -73,13 +82,17 @@ const RUN_LOG = "run.log";
73
82
 
74
83
  const DEFAULT_HOURS = 24;
75
84
  const DEFAULT_MAX_ITERATIONS = 200;
76
- // Anthropic prompt-cache TTL is 5 minutes (300,000 ms). The supervisor→worker
77
- // handoff budget is ~30 s (process exit + state persist + next spawn). A 270 s
78
- // worker timeout leaves room to complete the iter AND still relaunch against
79
- // a warm cache. If a single iter legitimately exceeds 270 s, the supervisor
80
- // kills the worker and logs a cache-miss warning; the next iter pays a
81
- // cold-cache cost but execution continues.
82
- const DEFAULT_WORKER_TIMEOUT_MS = 270 * 1000; // 270s — see cache-warm-pacing note above; contract §13/§16
85
+ // M43 liveness heartbeat (contract v1.4.0 §"Heartbeat Watchdog"):
86
+ // Healthy workers producing events every poll cycle (60 s) run under the
87
+ // absolute backstop raised from 270 s to 1 hour so long-running legitimate
88
+ // iterations are NOT cut. Stuck workers are detected by the heartbeat
89
+ // checker via events/YYYY-MM-DD.jsonl mtime and SIGTERM'd at the 5-min
90
+ // staleness threshold. The 270 s cache-pacing rationale is subsumed by the
91
+ // heartbeat check, which fires long before cache-miss cost becomes
92
+ // dominant.
93
+ const DEFAULT_WORKER_TIMEOUT_MS = 60 * 60 * 1000; // 1 h absolute backstop (contract §13/§16)
94
+ const DEFAULT_STALE_HEARTBEAT_MS = 5 * 60 * 1000; // 5 min — stuck-worker threshold
95
+ const DEFAULT_HEARTBEAT_POLL_MS = 60 * 1000; // 60 s poll cadence
83
96
 
84
97
  const TERMINAL_STATUSES = new Set(["done", "failed", "stopped", "crashed"]);
85
98
  const VALID_STATUSES = new Set([
@@ -115,6 +128,8 @@ module.exports = {
115
128
  TERMINAL_STATUSES,
116
129
  VALID_STATUSES,
117
130
  DEFAULT_WORKER_TIMEOUT_MS,
131
+ DEFAULT_STALE_HEARTBEAT_MS,
132
+ DEFAULT_HEARTBEAT_POLL_MS,
118
133
  };
119
134
 
120
135
  // ── parseArgs ───────────────────────────────────────────────────────────────
@@ -498,7 +513,7 @@ function finalizeState(state, dir, terminalStatus) {
498
513
  * reason?: string,
499
514
  * }}
500
515
  */
501
- function doUnattended(argv, deps) {
516
+ async function doUnattended(argv, deps) {
502
517
  deps = deps || {};
503
518
  const rawArgv = argv || [];
504
519
 
@@ -582,6 +597,13 @@ function doUnattended(argv, deps) {
582
597
  ) {
583
598
  opts.workerTimeoutMs = config.workerTimeoutMs;
584
599
  }
600
+ if (
601
+ opts.staleHeartbeatMs === undefined &&
602
+ typeof config.staleHeartbeatMs === "number" &&
603
+ config.staleHeartbeatMs > 0
604
+ ) {
605
+ opts.staleHeartbeatMs = config.staleHeartbeatMs;
606
+ }
585
607
  // CLI values now win — mirror them back into config so the pre-worker
586
608
  // safety caps (checkIterationCap / checkWallClockCap) use the effective
587
609
  // supervisor-scoped limits rather than the on-disk file defaults.
@@ -700,8 +722,11 @@ function doUnattended(argv, deps) {
700
722
  // Write the PID file. Singleton enforcement (refusing if another
701
723
  // supervisor is already alive) is owned by the launch handshake — see
702
724
  // contract §7. We trust the caller for now and just write our PID.
703
- const pidPath = path.join(dir, PID_FILE);
704
- fs.writeFileSync(pidPath, String(process.pid) + "\n", "utf8");
725
+ // Contract v1.4.1: write JSON fingerprint {pid, projectDir, startedAt}
726
+ // so resume-time liveness checks can distinguish "our supervisor" from
727
+ // "some other process recycled this PID" (macOS PID recycling).
728
+ const { writePidFile } = require("./supervisor-pid-fingerprint.cjs");
729
+ writePidFile(projectDir, process.pid);
705
730
 
706
731
  // Install terminal handlers BEFORE transitioning to `running` so a crash
707
732
  // mid-transition is still finalized.
@@ -793,7 +818,7 @@ function doUnattended(argv, deps) {
793
818
  // Main relay loop. Workers spawn fresh each iteration until the milestone
794
819
  // completes, the iteration cap is hit, a terminal exit code is returned,
795
820
  // a stop sentinel is observed, or a safety rails halt fires.
796
- runMainLoop(state, dir, opts, deps, { fn, config });
821
+ await runMainLoop(state, dir, opts, deps, { fn, config });
797
822
 
798
823
  // Terminal notification + explicit finalize. finalizeState is idempotent —
799
824
  // the process.on('exit') handler will be a no-op after this.
@@ -880,7 +905,7 @@ function _notifyAndFinalize(state, dir, fn, terminalHint) {
880
905
  * `stopRequested`). Task 4 will replace `_spawnWorker` with the real
881
906
  * cross-platform helper.
882
907
  */
883
- function runMainLoop(state, dir, opts, deps, ctx) {
908
+ async function runMainLoop(state, dir, opts, deps, ctx) {
884
909
  deps = deps || {};
885
910
  ctx = ctx || {};
886
911
  // Safety rails + platform helpers wired by doUnattended (fn) + loaded
@@ -904,6 +929,22 @@ function runMainLoop(state, dir, opts, deps, ctx) {
904
929
  deps._isMilestoneComplete || (useTestStub ? () => true : isMilestoneComplete);
905
930
  const stopCheck = deps._stopRequested || stopRequested;
906
931
  const workerTimeoutMs = opts.workerTimeoutMs || DEFAULT_WORKER_TIMEOUT_MS;
932
+ const staleHeartbeatMs =
933
+ (typeof opts.staleHeartbeatMs === "number" && opts.staleHeartbeatMs > 0
934
+ ? opts.staleHeartbeatMs
935
+ : (typeof config.staleHeartbeatMs === "number" && config.staleHeartbeatMs > 0
936
+ ? config.staleHeartbeatMs
937
+ : DEFAULT_STALE_HEARTBEAT_MS));
938
+ const heartbeatPollMs =
939
+ (typeof opts.heartbeatPollMs === "number" && opts.heartbeatPollMs > 0
940
+ ? opts.heartbeatPollMs
941
+ : DEFAULT_HEARTBEAT_POLL_MS);
942
+ // Test hook: deps._checkHeartbeat lets tests substitute the staleness
943
+ // checker without mocking fs. Production uses the real module.
944
+ const heartbeatImpl = deps._checkHeartbeat || _checkHeartbeat;
945
+ // Test hook: deps._disableHeartbeat lets unit tests bypass the async path
946
+ // for test-mode / stub spawns that return synchronously.
947
+ const heartbeatEnabled = !deps._disableHeartbeat && !useTestStub;
907
948
  const projectDir = state.projectDir;
908
949
 
909
950
  while (!isDone(state) && !stopCheck(projectDir)) {
@@ -948,12 +989,28 @@ function runMainLoop(state, dir, opts, deps, ctx) {
948
989
  });
949
990
 
950
991
  let res;
992
+ const workerStartMs = workerStart.getTime();
993
+ const hbOpts = heartbeatEnabled
994
+ ? {
995
+ onHeartbeatCheck: () =>
996
+ heartbeatImpl({
997
+ projectDir,
998
+ workerStartedAt: workerStartMs,
999
+ staleHeartbeatMs,
1000
+ }),
1001
+ heartbeatPollMs,
1002
+ }
1003
+ : {};
951
1004
  try {
952
1005
  res = spawnWorker(state, {
953
1006
  cwd: projectDir,
954
1007
  timeout: workerTimeoutMs,
955
1008
  verbose: !!opts.verbose,
1009
+ ...hbOpts,
956
1010
  });
1011
+ if (res && typeof res.then === "function") {
1012
+ res = await res;
1013
+ }
957
1014
  } catch (e) {
958
1015
  // Defensive: a real spawnSync shouldn't throw, but a shim could.
959
1016
  res = { status: 3, stdout: "", stderr: String((e && e.message) || e), signal: null };
@@ -965,28 +1022,40 @@ function runMainLoop(state, dir, opts, deps, ctx) {
965
1022
  const stdout = typeof res.stdout === "string" ? res.stdout : "";
966
1023
  const stderr = typeof res.stderr === "string" ? res.stderr : "";
967
1024
 
968
- // Timeout detection: spawnSync sets status=null and signal='SIGTERM' on
969
- // timeout (legacy shim), OR sets res.timedOut=true (platform.spawnWorker).
970
- // Map to contract code 124.
1025
+ // Kill-path detection (M43 heartbeat watchdog precedes wall-clock timeout):
1026
+ // - res.staleHeartbeat === true → heartbeat fired, code 125 (new)
1027
+ // - res.timedOut === true OR status=null+SIGTERM → wall-clock, code 124
1028
+ // Heartbeat wins on ties because it's the more specific signal.
971
1029
  let exitCode;
972
- if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
1030
+ let lastExitReason = null;
1031
+ if (res.staleHeartbeat === true) {
1032
+ exitCode = 125;
1033
+ lastExitReason = "stale_heartbeat";
1034
+ } else if (res.timedOut === true || res.status === null || res.signal === "SIGTERM") {
973
1035
  exitCode = 124;
1036
+ lastExitReason = "worker_timeout";
974
1037
  } else {
975
1038
  exitCode = mapHeadlessExitCode(res.status, stdout + "\n" + stderr);
976
1039
  }
977
1040
 
978
- // v3.13.11 Bug 1: when the watchdog fires (spawnSync timeout SIGTERM or
979
- // platform.spawnWorker timedOut flag), make the event explicit in run.log
980
- // so operators can see WHICH iteration timed out without inferring from
981
- // exit codes. The marker is prepended to stdout and written in the single
982
- // per-iter run.log append (no duplicate header).
1041
+ // v3.13.11 Bug 1: when a watchdog fires, make the event explicit in
1042
+ // run.log so operators can see WHICH iteration was cut without inferring
1043
+ // from exit codes. The marker is prepended to stdout and written in the
1044
+ // single per-iter run.log append (no duplicate header).
983
1045
  let loggedStdout = stdout;
984
1046
  if (exitCode === 124) {
985
1047
  const marker =
986
1048
  `[worker_timeout] iter=${state.iter} budget=${workerTimeoutMs}ms ` +
987
- `elapsed=${elapsedMs}ms — watchdog SIGTERM delivered, ` +
1049
+ `elapsed=${elapsedMs}ms — absolute-backstop SIGTERM delivered, ` +
988
1050
  `supervisor continues relay per contract §16.\n`;
989
1051
  loggedStdout = marker + (stdout || "");
1052
+ } else if (exitCode === 125) {
1053
+ const reason = res.heartbeatReason || "no recent events.jsonl writes";
1054
+ const marker =
1055
+ `[stale_heartbeat] iter=${state.iter} threshold=${staleHeartbeatMs}ms ` +
1056
+ `elapsed=${elapsedMs}ms reason="${reason}" — ` +
1057
+ `heartbeat watchdog SIGTERM delivered, supervisor continues relay.\n`;
1058
+ loggedStdout = marker + (stdout || "");
990
1059
  }
991
1060
 
992
1061
  // Append the full worker output to run.log (never truncate).
@@ -1007,6 +1076,13 @@ function runMainLoop(state, dir, opts, deps, ctx) {
1007
1076
  state.lastExit = exitCode;
1008
1077
  state.lastWorkerFinishedAt = workerEnd.toISOString();
1009
1078
  state.lastElapsedMs = elapsedMs;
1079
+ if (lastExitReason) {
1080
+ state.lastExitReason = lastExitReason;
1081
+ } else if (exitCode === 0) {
1082
+ state.lastExitReason = "clean";
1083
+ } else {
1084
+ state.lastExitReason = `exit_${exitCode}`;
1085
+ }
1010
1086
  writeState(state, dir);
1011
1087
 
1012
1088
  // Event-stream: task_complete on success, error on non-zero.
@@ -1102,6 +1178,20 @@ function runMainLoop(state, dir, opts, deps, ctx) {
1102
1178
  });
1103
1179
  continue;
1104
1180
  }
1181
+ if (exitCode === 125) {
1182
+ // Stale heartbeat (M43) — continue unless the iter cap hits. The
1183
+ // heartbeat kill is recoverable by definition: the worker was not
1184
+ // emitting events, which is the most common class of stuck iteration
1185
+ // (e.g. child stuck on a long Bash call with no tool_call emits).
1186
+ _emit(projectDir, {
1187
+ iter: state.iter,
1188
+ type: "retry",
1189
+ source: "supervisor",
1190
+ attempt: state.iter,
1191
+ reason: "stale_heartbeat",
1192
+ });
1193
+ continue;
1194
+ }
1105
1195
  // Non-terminal (1/2/3) — continue the relay.
1106
1196
  _emit(projectDir, {
1107
1197
  iter: state.iter,
@@ -1199,8 +1289,46 @@ function _spawnWorker(state, opts) {
1199
1289
  if (process.env.GSD_T_AGENT_ID) {
1200
1290
  workerEnv.GSD_T_PARENT_AGENT_ID = process.env.GSD_T_AGENT_ID;
1201
1291
  }
1202
- const res = platformSpawnWorker(opts.cwd, opts.timeout, {
1292
+
1293
+ // M42 D1 — allocate a spawn-id + open transcript before spawning. parentId
1294
+ // is the supervisor's own spawn-id (set once at supervisor start via
1295
+ // GSD_T_SPAWN_ID env) so the sidebar can render parent-indented trees.
1296
+ const parentSpawnId = process.env.GSD_T_SPAWN_ID || null;
1297
+ let teeSpawnId = null;
1298
+ try {
1299
+ teeSpawnId = transcriptTee.allocateSpawnId({ parentId: parentSpawnId });
1300
+ transcriptTee.openTranscript({
1301
+ spawnId: teeSpawnId,
1302
+ projectDir: opts.cwd,
1303
+ meta: {
1304
+ parentId: parentSpawnId,
1305
+ command: "gsd-t-unattended-worker",
1306
+ description: `iter=${state && state.iter ? state.iter : "?"} milestone=${state && state.milestone ? state.milestone : "-"}`,
1307
+ model: (state && state.model) || null,
1308
+ },
1309
+ });
1310
+ workerEnv.GSD_T_SPAWN_ID = teeSpawnId;
1311
+ } catch (_) { /* tee is best-effort */ }
1312
+
1313
+ const spawnResult = platformSpawnWorker(opts.cwd, opts.timeout, {
1203
1314
  bin,
1315
+ onHeartbeatCheck: opts.onHeartbeatCheck,
1316
+ heartbeatPollMs: opts.heartbeatPollMs,
1317
+ onHeartbeatSample: opts.onHeartbeatSample,
1318
+ // M43 live transcript tee — append each worker stdout line to the
1319
+ // transcript file as it arrives, so /transcript/:id/stream renders the
1320
+ // run in real time instead of waiting for the worker to exit.
1321
+ onStdoutLine: teeSpawnId
1322
+ ? (line) => {
1323
+ try {
1324
+ transcriptTee.appendFrame({
1325
+ spawnId: teeSpawnId,
1326
+ projectDir: opts.cwd,
1327
+ frame: line,
1328
+ });
1329
+ } catch (_) { /* tee is best-effort */ }
1330
+ }
1331
+ : undefined,
1204
1332
  args: [
1205
1333
  "-p",
1206
1334
  [
@@ -1254,14 +1382,47 @@ function _spawnWorker(state, opts) {
1254
1382
  ],
1255
1383
  env: workerEnv,
1256
1384
  });
1257
- return {
1258
- status: typeof res.status === "number" ? res.status : null,
1259
- stdout: res.stdout || "",
1260
- stderr: res.stderr || "",
1261
- signal: res.signal || null,
1262
- timedOut: !!res.timedOut,
1263
- error: res.error || null,
1385
+
1386
+ // M43 — finalize: live tee already wrote each line via onStdoutLine in the
1387
+ // platform layer; here we only mark the transcript closed with the worker's
1388
+ // terminal status. Legacy sync path (no onHeartbeatCheck) doesn't fire
1389
+ // onStdoutLine, but the supervisor always provides a heartbeat callback so
1390
+ // that branch is unreachable in production. If a future caller goes async
1391
+ // without heartbeat, transcripts would be empty — acceptable until then.
1392
+ const finalize = (res) => {
1393
+ if (teeSpawnId) {
1394
+ try {
1395
+ const status =
1396
+ typeof res.status === "number" && res.status === 0
1397
+ ? "done"
1398
+ : res.timedOut
1399
+ ? "stopped"
1400
+ : "failed";
1401
+ transcriptTee.closeTranscript({
1402
+ spawnId: teeSpawnId,
1403
+ projectDir: opts.cwd,
1404
+ status,
1405
+ });
1406
+ } catch (_) { /* tee is best-effort */ }
1407
+ }
1408
+
1409
+ return {
1410
+ status: typeof res.status === "number" ? res.status : null,
1411
+ stdout: res.stdout || "",
1412
+ stderr: res.stderr || "",
1413
+ signal: res.signal || null,
1414
+ timedOut: !!res.timedOut,
1415
+ staleHeartbeat: !!res.staleHeartbeat,
1416
+ heartbeatReason: res.heartbeatReason || null,
1417
+ error: res.error || null,
1418
+ spawnId: teeSpawnId,
1419
+ };
1264
1420
  };
1421
+
1422
+ if (spawnResult && typeof spawnResult.then === "function") {
1423
+ return spawnResult.then(finalize);
1424
+ }
1425
+ return finalize(spawnResult);
1265
1426
  }
1266
1427
 
1267
1428
  // ── _testModeSpawnWorker ────────────────────────────────────────────────────