@link-assistant/hive-mind 2.0.3 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,18 +8,25 @@
8
8
  * 1. Screen mode (default): Uses `screen -ls` to detect session completion
9
9
  * 2. Isolation mode: Uses `$ --status <uuid>` from start-command CLI for reliable tracking
10
10
  *
11
- * Session state is stored in-memory. The `$` CLI (start-command) is accessed
12
- * purely via its CLI interface, not as a library dependency.
11
+ * Session state is stored in-memory and, since issue #1927, mirrored to a
12
+ * durable on-disk store so a bot restart can reload and resume monitoring of
13
+ * detached sessions that were still running when the previous process died. The
14
+ * `$` CLI (start-command) is accessed purely via its CLI interface, not as a
15
+ * library dependency.
13
16
  *
14
17
  * @see https://github.com/link-foundation/start
15
18
  * @see https://github.com/link-assistant/hive-mind/issues/380
19
+ * @see https://github.com/link-assistant/hive-mind/issues/1927
16
20
  */
17
21
 
18
22
  import { exec as execCallback } from 'child_process';
19
23
  import fs from 'fs/promises';
20
24
  import { promisify } from 'util';
21
- import { formatSessionCompletionMessage, getSessionCompletionExitCode } from './work-session-formatting.lib.mjs';
25
+ import { formatSessionCompletionMessage, getSessionCompletionExitCode, classifySessionOutcome } from './work-session-formatting.lib.mjs';
22
26
  import { notifySubscribers, getSubscriberCount } from './telegram-subscribers.lib.mjs';
27
+ import { classifyExitStatus } from './session-status.lib.mjs';
28
+ import path from 'node:path';
29
+ import { readLastSessionIdFromLog, findLatestSessionLogId, buildResumeCommand, formatResumeSection } from './session-resume.lib.mjs';
23
30
 
24
31
  export { formatSessionCompletionMessage, getSessionCompletionExitCode } from './work-session-formatting.lib.mjs';
25
32
 
@@ -36,8 +43,41 @@ async function getIsolationRunner() {
36
43
  // In-memory session store
37
44
  const activeSessions = new Map();
38
45
 
46
+ // Issue #1927: optional durable mirror of the in-memory registry. When set (by
47
+ // the bot at startup via setSessionStore), every track/complete is persisted so
48
+ // a restart can reload and keep monitoring detached sessions. Left null in unit
49
+ // tests and one-off CLI paths, where in-memory tracking is sufficient.
50
+ let sessionStore = null;
51
+ let sessionLogger = null;
52
+
53
+ /**
54
+ * Attach a durable session store (see session-store.lib.mjs) so tracked sessions
55
+ * survive a bot restart. Passing null disconnects the store (used by tests).
56
+ * @param {object|null} store
57
+ */
58
+ export function setSessionStore(store) {
59
+ sessionStore = store || null;
60
+ }
61
+
62
+ /**
63
+ * Attach a structured logger (see bot-logger.lib.mjs) so session lifecycle
64
+ * transitions are recorded with timestamps. Optional; console is used otherwise.
65
+ * @param {object|null} logger
66
+ */
67
+ export function setSessionLogger(logger) {
68
+ sessionLogger = logger || null;
69
+ }
70
+
71
+ function logEvent(type, data) {
72
+ if (sessionLogger && typeof sessionLogger.event === 'function') {
73
+ sessionLogger.event(type, data);
74
+ }
75
+ }
76
+
39
77
  export function resetSessionMonitorForTests() {
40
78
  activeSessions.clear();
79
+ sessionStore = null;
80
+ sessionLogger = null;
41
81
  }
42
82
 
43
83
  /**
@@ -102,10 +142,41 @@ export async function checkScreenSessionExists(sessionName) {
102
142
  */
103
143
  export function trackSession(sessionName, sessionInfo, verbose = false) {
104
144
  activeSessions.set(sessionName, sessionInfo);
145
+ const mode = sessionInfo.isolationBackend ? `isolation:${sessionInfo.isolationBackend}` : 'screen';
105
146
  if (verbose) {
106
- const mode = sessionInfo.isolationBackend ? `isolation:${sessionInfo.isolationBackend}` : 'screen';
107
147
  console.log(`[VERBOSE] Session ${sessionName} tracked in memory (mode: ${mode})`);
108
148
  }
149
+ // Issue #1927: mirror to the durable store so a restart can resume monitoring.
150
+ // Only isolation-backed sessions are persisted — they are the ones tracked in
151
+ // `$` (start-command) with a reliable status record (requirement #2). Plain
152
+ // screen sessions are timeout-based best-effort; resuming them after a restart
153
+ // could fabricate a "finished" message with no real exit code, so they stay
154
+ // in-memory only.
155
+ if (sessionStore && isPersistableSession(sessionInfo)) {
156
+ try {
157
+ sessionStore.persist(sessionName, sessionInfo);
158
+ } catch (error) {
159
+ console.error(`[session-monitor] Could not persist session ${sessionName}: ${error.message}`);
160
+ }
161
+ }
162
+ logEvent('session_tracked', {
163
+ sessionName,
164
+ mode,
165
+ url: sessionInfo.url || null,
166
+ command: sessionInfo.command || null,
167
+ sessionId: sessionInfo.sessionId || null,
168
+ startTime: sessionInfo.startTime instanceof Date ? sessionInfo.startTime.toISOString() : sessionInfo.startTime || null,
169
+ });
170
+ }
171
+
172
+ /**
173
+ * Whether a session should be mirrored to the durable store. Only isolation
174
+ * sessions with a start-command UUID qualify (see trackSession rationale).
175
+ * @param {object} sessionInfo
176
+ * @returns {boolean}
177
+ */
178
+ function isPersistableSession(sessionInfo) {
179
+ return Boolean(sessionInfo?.isolationBackend && sessionInfo?.sessionId);
109
180
  }
110
181
 
111
182
  /**
@@ -156,11 +227,22 @@ function getActiveSessions(verbose = false) {
156
227
  * @param {string} sessionName - Name of the session to remove
157
228
  * @param {boolean} verbose - Whether to log verbose output
158
229
  */
159
- function completeSession(sessionName, exitCode = 0, verbose = false) {
230
+ function completeSession(sessionName, exitCode = 0, verbose = false, status = null) {
231
+ const sessionInfo = activeSessions.get(sessionName) || null;
160
232
  activeSessions.delete(sessionName);
161
233
  if (verbose) {
162
- console.log(`[VERBOSE] Session ${sessionName} removed from tracking (exit: ${exitCode})`);
234
+ console.log(`[VERBOSE] Session ${sessionName} removed from tracking (exit: ${exitCode}${status ? `, status: ${status}` : ''})`);
235
+ }
236
+ // Issue #1927: drop from the durable snapshot (and append a `complete` audit
237
+ // event recording how it ended) so a later restart does not try to resume it.
238
+ if (sessionStore && isPersistableSession(sessionInfo)) {
239
+ try {
240
+ sessionStore.remove(sessionName, { status, exitCode });
241
+ } catch (error) {
242
+ console.error(`[session-monitor] Could not remove persisted session ${sessionName}: ${error.message}`);
243
+ }
163
244
  }
245
+ logEvent('session_completed', { sessionName, exitCode: exitCode ?? null, status: status || null });
164
246
  }
165
247
 
166
248
  function isMessageAlreadyUpdatedError(error) {
@@ -232,8 +314,72 @@ function isNonIsolationSessionActive(sessionName, sessionInfo, verbose = false)
232
314
  return true;
233
315
  }
234
316
 
317
+ /**
318
+ * Issue #1927: minimum age before a session that `$ --status` still reports as
319
+ * `executing` is allowed to be declared dead purely on a backend-liveness probe
320
+ * (the screen/tmux/docker session is gone). This avoids a race where a session
321
+ * that has just been launched — but whose backend has not registered yet — is
322
+ * falsely reported as killed. The authoritative log-footer check is NOT gated by
323
+ * this, because a written "Exit Code:" footer is proof the command terminated.
324
+ */
325
+ export const STALE_EXECUTING_MIN_AGE_MS = 90 * 1000;
326
+
327
+ function sessionStartMs(sessionInfo) {
328
+ const start = sessionInfo?.startTime;
329
+ if (!start) return null;
330
+ const date = start instanceof Date ? start : new Date(start);
331
+ const ms = date.getTime();
332
+ return Number.isFinite(ms) ? ms : null;
333
+ }
334
+
335
+ /**
336
+ * Cross-check whether a session that `$ --status` still reports as `executing`
337
+ * has actually terminated. Issue #1927: start-command's status can get stuck on
338
+ * `executing` after the process was killed (a lingering shell keeps the screen
339
+ * session alive, flipping executed→executing), so a SIGKILLed /solve was never
340
+ * reported. Two independent signals are consulted, strongest first:
341
+ *
342
+ * 1. The execution log FOOTER. When start-command wrote "Exit Code: N" the
343
+ * command terminated, full stop — regardless of what `--status` claims.
344
+ * This is authoritative and catches the dominant lingering-shell case.
345
+ * 2. Backend LIVENESS. If no footer was written (e.g. the wrapper itself was
346
+ * hard-killed) but the backing screen/tmux/docker session is gone, the
347
+ * process cannot still be executing. Gated by STALE_EXECUTING_MIN_AGE_MS to
348
+ * avoid a just-launched-not-yet-registered race.
349
+ *
350
+ * @returns {Promise<{exitCode: number|null, status: string, reason: string}|null>}
351
+ * Terminal details when the session is actually dead, else null (still running).
352
+ */
353
+ async function resolveStaleExecutingState(sessionName, sessionInfo, statusResult, { verbose, runner, exitFromLog, backendAlive }) {
354
+ // 1. Authoritative: the log footer.
355
+ const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
356
+ if (logPath) {
357
+ const readFooter = exitFromLog || runner.readSessionExitFromLog;
358
+ const footer = readFooter ? readFooter(logPath, { verbose }) : null;
359
+ if (footer?.finished) {
360
+ const status = classifyExitStatus(footer.exitCode) || (footer.exitCode === 0 ? 'executed' : 'failed');
361
+ return { exitCode: footer.exitCode, status, reason: `log-footer(exit ${footer.exitCode})` };
362
+ }
363
+ }
364
+
365
+ // 2. Liveness probe, only once the session is old enough to have registered.
366
+ const startMs = sessionStartMs(sessionInfo);
367
+ const ageMs = startMs != null ? Date.now() - startMs : Infinity;
368
+ if (ageMs >= STALE_EXECUTING_MIN_AGE_MS && sessionInfo?.isolationBackend) {
369
+ const probe = backendAlive || runner.checkBackendSessionAlive;
370
+ const alive = probe ? await probe(sessionInfo.sessionId || sessionName, sessionInfo.isolationBackend, verbose) : null;
371
+ // Only `false` (definitively gone) counts as killed; `null` (unknown backend)
372
+ // is treated as "no signal" so we don't kill on an indeterminate probe.
373
+ if (alive === false) {
374
+ return { exitCode: null, status: 'killed', reason: 'backend-gone' };
375
+ }
376
+ }
377
+
378
+ return null;
379
+ }
380
+
235
381
  async function getIsolationSessionState(sessionName, sessionInfo, options = {}) {
236
- const { verbose = false, statusProvider = null } = options;
382
+ const { verbose = false, statusProvider = null, exitFromLog = null, backendAlive = null, sessionRunning = null } = options;
237
383
  const sessionId = sessionInfo.sessionId || sessionName;
238
384
 
239
385
  try {
@@ -242,30 +388,81 @@ async function getIsolationSessionState(sessionName, sessionInfo, options = {})
242
388
 
243
389
  if (statusResult?.exists && statusResult.status) {
244
390
  if (runner.isExecutingSessionStatus(statusResult.status)) {
391
+ // Issue #1927: an `executing` status is not trusted blindly — verify the
392
+ // process is really alive. start-command can keep reporting `executing`
393
+ // after a kill, which is exactly how an OOM-killed /solve went unreported.
394
+ const stale = await resolveStaleExecutingState(sessionName, sessionInfo, statusResult, { verbose, runner, exitFromLog, backendAlive });
395
+ if (stale) {
396
+ if (verbose) {
397
+ console.log(`[VERBOSE] Session ${sessionName} reported '${statusResult.status}' but is actually terminated (${stale.reason}); treating as ${stale.status} (exit ${stale.exitCode})`);
398
+ }
399
+ // Rewrite the status payload so downstream completion formatting sees
400
+ // the real terminal status/exit code instead of the stale `executing`.
401
+ const correctedStatus = stale.status || 'killed';
402
+ const corrected = { ...statusResult, status: correctedStatus, exitCode: stale.exitCode, endTime: statusResult.endTime || stale.endTime || null };
403
+ return { running: false, exitCode: stale.exitCode, status: correctedStatus, statusResult: corrected, stale: true };
404
+ }
245
405
  return { running: true, exitCode: null, status: statusResult.status, statusResult };
246
406
  }
247
407
  if (runner.isTerminalSessionStatus(statusResult.status)) {
408
+ let exitCode = statusResult.exitCode !== undefined ? statusResult.exitCode : null;
409
+ // Issue #1927: when start-command reports a terminal status but a missing
410
+ // or sentinel (-1) exit code — which its lingering-shell reverse-flip can
411
+ // produce — recover the real code from the log footer so a SIGKILL is not
412
+ // mislabelled as a generic failure.
413
+ if ((exitCode === null || exitCode === -1) && (statusResult.logPath || sessionInfo?.logPath)) {
414
+ const readFooter = exitFromLog || runner.readSessionExitFromLog;
415
+ const footer = readFooter ? readFooter(statusResult.logPath || sessionInfo.logPath, { verbose }) : null;
416
+ if (footer?.finished) {
417
+ exitCode = footer.exitCode;
418
+ const correctedStatus = classifyExitStatus(footer.exitCode) || statusResult.status;
419
+ if (verbose) {
420
+ console.log(`[VERBOSE] Session ${sessionName} reported terminal '${statusResult.status}' with exit ${statusResult.exitCode}; recovered real exit ${exitCode} (${correctedStatus}) from log footer`);
421
+ }
422
+ return { running: false, exitCode, status: correctedStatus, statusResult: { ...statusResult, status: correctedStatus, exitCode } };
423
+ }
424
+ }
248
425
  // Issue #1939: a native docker session can report a terminal status
249
426
  // ("executed") with the unknown exit-code sentinel (-1) while the
250
- // container is still running. Such a status is provisional fall
251
- // through to isSessionRunning(), which cross-checks the live container
252
- // via `docker inspect` before we notify the user the work finished.
253
- const ambiguousDockerTerminal = sessionInfo.isolationBackend === 'docker' && typeof runner.isUnknownDockerExitCode === 'function' && runner.isUnknownDockerExitCode(statusResult.exitCode);
427
+ // container is still running. When the log footer above did not recover
428
+ // a real terminal exit, such a status is provisional — fall through to
429
+ // isSessionRunning() below, which cross-checks the live container via
430
+ // `docker inspect` before we notify the user the work finished.
431
+ const ambiguousDockerTerminal = sessionInfo.isolationBackend === 'docker' && typeof runner.isUnknownDockerExitCode === 'function' && runner.isUnknownDockerExitCode(exitCode);
254
432
  if (!ambiguousDockerTerminal) {
255
- return {
256
- running: false,
257
- exitCode: statusResult.exitCode !== undefined ? statusResult.exitCode : null,
258
- status: statusResult.status,
259
- statusResult,
260
- };
433
+ return { running: false, exitCode, status: statusResult.status, statusResult };
261
434
  }
262
435
  }
263
436
  }
264
437
 
265
- const running = await runner.isSessionRunning(sessionId, {
438
+ // No conclusive status was obtained above (record missing, status neither executing nor terminal, or an ambiguous docker terminal status). Fall back to a
439
+ // direct backend liveness check. `sessionRunning` is injectable purely so
440
+ // this path is testable without the real `$`/`screen` binaries; production
441
+ // always uses the runner's real check.
442
+ const checkRunning = sessionRunning || runner.isSessionRunning;
443
+ const running = await checkRunning(sessionId, {
266
444
  backend: sessionInfo.isolationBackend,
267
445
  verbose,
268
446
  });
447
+ if (!running) {
448
+ // Issue #1927: the `$ --status` record is unavailable (e.g. garbage-
449
+ // collected while the bot was down) and the backend reports not-running.
450
+ // Before declaring a bare null exit — which classifies as success — try
451
+ // the log footer so a session that was killed while we were offline is
452
+ // reported as the kill it was, not a silent success.
453
+ const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
454
+ if (logPath) {
455
+ const readFooter = exitFromLog || runner.readSessionExitFromLog;
456
+ const footer = readFooter ? readFooter(logPath, { verbose }) : null;
457
+ if (footer?.finished) {
458
+ const correctedStatus = classifyExitStatus(footer.exitCode) || (footer.exitCode === 0 ? 'executed' : 'failed');
459
+ if (verbose) {
460
+ console.log(`[VERBOSE] Session ${sessionName} has no live status record; recovered exit ${footer.exitCode} (${correctedStatus}) from log footer`);
461
+ }
462
+ return { running: false, exitCode: footer.exitCode, status: correctedStatus, statusResult: { ...(statusResult || {}), status: correctedStatus, exitCode: footer.exitCode, endTime: statusResult?.endTime || footer.endTime || null } };
463
+ }
464
+ }
465
+ }
269
466
  return {
270
467
  running,
271
468
  exitCode: running ? null : (statusResult?.exitCode ?? null),
@@ -300,6 +497,7 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
300
497
  let stillRunning;
301
498
  let exitCode = null;
302
499
  let statusResult = null;
500
+ let resolvedStatus = null;
303
501
 
304
502
  if (sessionInfo.isolationBackend && sessionInfo.sessionId) {
305
503
  // Isolation mode: use $ --status, with screen -ls only as a fallback
@@ -308,10 +506,31 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
308
506
  const state = await getIsolationSessionState(sessionName, sessionInfo, {
309
507
  verbose,
310
508
  statusProvider: options.statusProvider,
509
+ exitFromLog: options.exitFromLog,
510
+ backendAlive: options.backendAlive,
511
+ sessionRunning: options.sessionRunning,
311
512
  });
312
513
  stillRunning = state.running;
313
514
  exitCode = state.exitCode;
314
515
  statusResult = state.statusResult;
516
+ resolvedStatus = state.status || statusResult?.status || null;
517
+ if (state.stale && verbose) {
518
+ console.log(`[VERBOSE] Session ${sessionName} detected as killed/terminated despite an 'executing' status report (issue #1927 cross-check)`);
519
+ }
520
+ // Issue #1927: once start-command reveals the log path, record it in the
521
+ // durable snapshot. If the bot dies and restarts after start-command has
522
+ // garbage-collected the status record, the resumed session can still read
523
+ // the log footer to learn whether it was killed.
524
+ if (statusResult?.logPath && sessionInfo.logPath !== statusResult.logPath) {
525
+ sessionInfo.logPath = statusResult.logPath;
526
+ if (sessionStore) {
527
+ try {
528
+ sessionStore.persist(sessionName, sessionInfo);
529
+ } catch {
530
+ /* best effort — persistence must never break monitoring */
531
+ }
532
+ }
533
+ }
315
534
  } else {
316
535
  // Issue #1586: Non-isolation screen sessions cannot reliably detect
317
536
  // completion because start-screen keeps the screen alive via `exec bash`.
@@ -389,6 +608,46 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
389
608
  }
390
609
  }
391
610
 
611
+ // Issue #1927 (review follow-up): when a /solve session was KILLED
612
+ // (OOM/SIGKILL — the silent failure this issue is about), surface a
613
+ // ready-to-run `--resume <lastSessionId>` command so the surviving
614
+ // parent (the operator, or an automation watching the bot) can pick the
615
+ // work back up. We deliberately do NOT auto-relaunch here: a job that
616
+ // reliably OOMs would storm. The rule "use the LAST of multiple
617
+ // sessions" is honored by reading the last `Session ID:` marker from
618
+ // the captured log. Purely additive — failures never block the
619
+ // completion notification, preserving backward compatibility.
620
+ const resumeExtraSections = [];
621
+ try {
622
+ const outcome = classifySessionOutcome({ exitCode: finalExitCode, status: resolvedStatus });
623
+ const isResumableCommand = (sessionInfo?.command || 'solve') === 'solve';
624
+ if (outcome.killed && isResumableCommand) {
625
+ const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
626
+ // The id must be the AI TOOL's session id, not the isolation session
627
+ // id (sessionInfo.sessionId — wrong namespace for `solve --resume`).
628
+ // Prefer the last `Session ID:` marker in the captured log; fall
629
+ // back to the newest `<sessionId>.log` start-command wrote in the
630
+ // same directory. If neither exists, offer no command (a bogus
631
+ // resume id would be worse than none).
632
+ let lastSessionId = readLastSessionIdFromLog(logPath, { verbose });
633
+ if (!lastSessionId && logPath) {
634
+ lastSessionId = findLatestSessionLogId({ dir: path.dirname(logPath), verbose });
635
+ }
636
+ const resumeCommand = buildResumeCommand({ sessionInfo, lastSessionId });
637
+ const resumeSection = formatResumeSection({ lastSessionId, command: resumeCommand });
638
+ if (resumeSection) {
639
+ resumeExtraSections.push(resumeSection);
640
+ if (verbose) {
641
+ console.log(`[VERBOSE] Session ${sessionName} was killed; offering resume from last session ${lastSessionId}`);
642
+ }
643
+ }
644
+ }
645
+ } catch (resumeError) {
646
+ if (verbose) {
647
+ console.log(`[VERBOSE] Could not build resume section for ${sessionName}: ${resumeError?.message || resumeError}`);
648
+ }
649
+ }
650
+
392
651
  const message = formatSessionCompletionMessage({
393
652
  sessionName,
394
653
  sessionInfo,
@@ -397,7 +656,7 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
397
656
  exitCode: finalExitCode,
398
657
  infoBlock: sessionInfo?.infoBlock || '',
399
658
  pullRequestUrl,
400
- extraSections: limitsExtraSections,
659
+ extraSections: [...limitsExtraSections, ...resumeExtraSections],
401
660
  });
402
661
 
403
662
  // Update the original reply message if messageId is available, otherwise send new message
@@ -437,11 +696,11 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
437
696
  }
438
697
  }
439
698
 
440
- completeSession(sessionName, finalExitCode || 0, verbose);
699
+ completeSession(sessionName, finalExitCode || 0, verbose, resolvedStatus);
441
700
  } catch (error) {
442
701
  console.error(`Failed to send completion notification for ${sessionName}:`, error);
443
702
  if (isMessageAlreadyUpdatedError(error)) {
444
- completeSession(sessionName, exitCode || 0, verbose);
703
+ completeSession(sessionName, exitCode || 0, verbose, resolvedStatus);
445
704
  } else {
446
705
  sessionInfo.lastNotificationError = error.message;
447
706
  sessionInfo.lastKnownStatus = statusResult?.status || sessionInfo.lastKnownStatus || null;
@@ -529,10 +788,88 @@ export function startSessionMonitoring(bot, verbose = false, intervalMs = 30000,
529
788
  };
530
789
  const timer = setInterval(runMonitor, intervalMs);
531
790
  runMonitor();
532
- console.log(`📊 Session monitoring started (checking every ${intervalMs / 1000} seconds, storage: in-memory)`);
791
+ const storage = sessionStore ? `durable+in-memory (${sessionStore.snapshotPath})` : 'in-memory';
792
+ console.log(`📊 Session monitoring started (checking every ${intervalMs / 1000} seconds, storage: ${storage})`);
533
793
  return timer;
534
794
  }
535
795
 
796
+ /**
797
+ * Issue #1927 (requirements #2 and #4): after a bot restart, reload the sessions
798
+ * that were still being tracked when the previous process died and re-register
799
+ * them so {@link monitorSessions} resumes watching them to completion. The very
800
+ * next monitor tick re-queries each session's status — so a session that was
801
+ * *killed while the bot was down* is finally reported (via the log-footer /
802
+ * backend-liveness cross-check in {@link getIsolationSessionState}) instead of
803
+ * vanishing silently.
804
+ *
805
+ * Only sessions persisted by this bot are resumed (they carry the chatId /
806
+ * messageId needed to notify). The durable snapshot already contains exactly the
807
+ * sessions that had not completed when the previous process died, because
808
+ * completed sessions are removed from it. As a guard we additionally skip any
809
+ * record whose startTime is after the current bot start (it cannot belong to a
810
+ * previous run), satisfying requirement #2's "started before bot start time".
811
+ *
812
+ * @param {object} [options]
813
+ * @param {object} [options.store] - Session store to load from (default: the store set via setSessionStore).
814
+ * @param {number} [options.botStartTime] - Epoch seconds; persisted sessions whose startTime is later than this are skipped. A startTime equal to it, or a missing/unparsable one, does not prevent resuming. Defaults to now.
815
+ * @param {boolean} [options.verbose]
816
+ * @returns {Promise<{resumed: Array<{sessionName: string, sessionInfo: object}>, skipped: Array<{sessionName: string, reason: string}>}>}
817
+ */
818
+ export async function resumeTrackedSessions(options = {}) {
819
+ const { store = sessionStore, verbose = false, botStartTime = Math.floor(Date.now() / 1000) } = options;
820
+ const resumed = [];
821
+ const skipped = [];
822
+
823
+ if (!store) {
824
+ if (verbose) console.log('[VERBOSE] resumeTrackedSessions: no durable session store configured, nothing to resume');
825
+ return { resumed, skipped };
826
+ }
827
+
828
+ let persisted = [];
829
+ try {
830
+ persisted = store.load();
831
+ } catch (error) {
832
+ console.error(`[session-monitor] resumeTrackedSessions: could not load persisted sessions: ${error.message}`);
833
+ return { resumed, skipped };
834
+ }
835
+
836
+ for (const { sessionName, sessionInfo } of persisted) {
837
+ if (activeSessions.has(sessionName)) {
838
+ skipped.push({ sessionName, reason: 'already-tracked' });
839
+ continue;
840
+ }
841
+ // Requirement #2/#4: a session that started after this bot came up cannot be
842
+ // a leftover from a previous run, so never resume it here.
843
+ const startMs = sessionStartMs(sessionInfo);
844
+ if (startMs != null && startMs > botStartTime * 1000) {
845
+ skipped.push({ sessionName, reason: 'started-after-bot-start' });
846
+ if (verbose) console.log(`[VERBOSE] Skipping resume of ${sessionName}: started after bot start`);
847
+ continue;
848
+ }
849
+
850
+ activeSessions.set(sessionName, sessionInfo);
851
+ resumed.push({ sessionName, sessionInfo });
852
+ logEvent('session_resumed', {
853
+ sessionName,
854
+ url: sessionInfo.url || null,
855
+ command: sessionInfo.command || null,
856
+ sessionId: sessionInfo.sessionId || null,
857
+ startTime: sessionInfo.startTime instanceof Date ? sessionInfo.startTime.toISOString() : sessionInfo.startTime || null,
858
+ });
859
+ if (verbose) {
860
+ console.log(`[VERBOSE] Resumed tracking of session ${sessionName} (url: ${sessionInfo.url || 'n/a'}, command: ${sessionInfo.command || 'n/a'}, backend: ${sessionInfo.isolationBackend || 'screen'})`);
861
+ }
862
+ }
863
+
864
+ if (resumed.length > 0) {
865
+ console.log(`♻️ Resumed monitoring of ${resumed.length} session(s) from durable store after restart`);
866
+ } else if (verbose) {
867
+ console.log('[VERBOSE] resumeTrackedSessions: no eligible sessions to resume');
868
+ }
869
+
870
+ return { resumed, skipped };
871
+ }
872
+
536
873
  /**
537
874
  * Issue #1567: Check if there's an active session for a given URL.
538
875
  * This prevents concurrent sessions on the same PR/issue, which causes
@@ -764,9 +1101,17 @@ export async function getRunningSessionItems(verbose = false, options = {}) {
764
1101
  let status = null;
765
1102
 
766
1103
  if (sessionInfo.isolationBackend) {
1104
+ // Forward every injectable seam so the listing applies the same #1927
1105
+ // stale-`executing` reconciliation the monitor does — a session that
1106
+ // start-command still reports as `executing` but whose backend is gone (or
1107
+ // whose log footer shows a kill) must not be listed as running — and so the
1108
+ // whole path stays controllable from tests.
767
1109
  const state = await getIsolationSessionState(sessionName, sessionInfo, {
768
1110
  verbose,
769
1111
  statusProvider: options.statusProvider,
1112
+ exitFromLog: options.exitFromLog,
1113
+ backendAlive: options.backendAlive,
1114
+ sessionRunning: options.sessionRunning,
770
1115
  });
771
1116
  running = state.running;
772
1117
  status = state.status || null;