@link-assistant/hive-mind 2.0.2 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,18 +8,25 @@
8
8
  * 1. Screen mode (default): Uses `screen -ls` to detect session completion
9
9
  * 2. Isolation mode: Uses `$ --status <uuid>` from start-command CLI for reliable tracking
10
10
  *
11
- * Session state is stored in-memory. The `$` CLI (start-command) is accessed
12
- * purely via its CLI interface, not as a library dependency.
11
+ * Session state is stored in-memory and, since issue #1927, mirrored to a
12
+ * durable on-disk store so a bot restart can reload and resume monitoring of
13
+ * detached sessions that were still running when the previous process died. The
14
+ * `$` CLI (start-command) is accessed purely via its CLI interface, not as a
15
+ * library dependency.
13
16
  *
14
17
  * @see https://github.com/link-foundation/start
15
18
  * @see https://github.com/link-assistant/hive-mind/issues/380
19
+ * @see https://github.com/link-assistant/hive-mind/issues/1927
16
20
  */
17
21
 
18
22
  import { exec as execCallback } from 'child_process';
19
23
  import fs from 'fs/promises';
20
24
  import { promisify } from 'util';
21
- import { formatSessionCompletionMessage, getSessionCompletionExitCode } from './work-session-formatting.lib.mjs';
25
+ import { formatSessionCompletionMessage, getSessionCompletionExitCode, classifySessionOutcome } from './work-session-formatting.lib.mjs';
22
26
  import { notifySubscribers, getSubscriberCount } from './telegram-subscribers.lib.mjs';
27
+ import { classifyExitStatus } from './session-status.lib.mjs';
28
+ import path from 'node:path';
29
+ import { readLastSessionIdFromLog, findLatestSessionLogId, buildResumeCommand, formatResumeSection } from './session-resume.lib.mjs';
23
30
 
24
31
  export { formatSessionCompletionMessage, getSessionCompletionExitCode } from './work-session-formatting.lib.mjs';
25
32
 
@@ -36,8 +43,59 @@ async function getIsolationRunner() {
36
43
  // In-memory session store
37
44
  const activeSessions = new Map();
38
45
 
46
+ // Issue #1927: optional durable mirror of the in-memory registry. When set (by
47
+ // the bot at startup via setSessionStore), every track/complete is persisted so
48
+ // a restart can reload and keep monitoring detached sessions. Left null in unit
49
+ // tests and one-off CLI paths, where in-memory tracking is sufficient.
50
+ let sessionStore = null;
51
+ let sessionLogger = null;
52
+
53
/**
 * Install (or detach) the durable session store that mirrors tracked sessions.
 *
 * Any falsy argument, including `undefined`, is normalised to `null`, which
 * detaches the store and leaves tracking in-memory only.
 *
 * @param {object|null} store - Store instance, or a falsy value to detach.
 * @returns {void}
 */
export function setSessionStore(store) {
  sessionStore = store ? store : null;
}
61
+
62
/**
 * Install (or detach) the structured logger that receives session lifecycle
 * events through its `event(type, data)` method.
 *
 * Any falsy argument is normalised to `null`; with no logger attached,
 * lifecycle events are simply not forwarded anywhere.
 *
 * @param {object|null} logger - Logger instance, or a falsy value to detach.
 * @returns {void}
 */
export function setSessionLogger(logger) {
  sessionLogger = logger ? logger : null;
}
70
+
71
/**
 * Forward a session lifecycle event to the attached structured logger.
 *
 * Does nothing when no logger is attached or the logger has no `event`
 * method. Logging is best-effort: an exception thrown by the logger is
 * reported on stderr and swallowed, because callers invoke this after they
 * have already updated the session registry and must not be interrupted by a
 * faulty logger.
 *
 * @param {string} type - Event name, e.g. `session_tracked`.
 * @param {object} data - Event payload handed to the logger unchanged.
 * @returns {void}
 */
function logEvent(type, data) {
  if (typeof sessionLogger?.event !== 'function') {
    return;
  }
  try {
    sessionLogger.event(type, data);
  } catch (error) {
    console.error(`[session-monitor] Could not log event ${type}: ${error?.message || error}`);
  }
}
76
+
39
77
  export function resetSessionMonitorForTests() {
40
78
  activeSessions.clear();
79
+ sessionStore = null;
80
+ sessionLogger = null;
81
+ }
82
+
83
/**
 * Test seam: replace the cached isolation runner with a stub so that
 * getIsolationSessionState can be exercised without spawning real
 * `$ --status` / docker probes (issue #1939).
 *
 * The value is stored as-is. NOTE(review): the original note says passing
 * `null` makes the next lookup lazily import the real runner again; that
 * lookup lives outside this excerpt, so confirm against getIsolationRunner.
 *
 * @param {object|null} runner - Stub runner, or `null` to clear the stub.
 * @returns {void}
 */
export function __setIsolationRunnerForTests(runner) {
  _isolationRunner = runner;
}
91
+
92
/**
 * Test-only doorway to the module-private getIsolationSessionState.
 *
 * Arguments are forwarded untouched and the callee's result is returned
 * untouched; `options` defaults to an empty object.
 *
 * @param {string} sessionName - Name of the tracked session.
 * @param {object} sessionInfo - Session record as stored in the registry.
 * @param {object} [options] - Options forwarded to getIsolationSessionState.
 * @returns {*} Whatever getIsolationSessionState returns for these arguments.
 */
export function getIsolationSessionStateForTests(sessionName, sessionInfo, options = {}) {
  const state = getIsolationSessionState(sessionName, sessionInfo, options);
  return state;
}
42
100
 
43
101
  /**
@@ -84,10 +142,41 @@ export async function checkScreenSessionExists(sessionName) {
84
142
  */
85
143
  export function trackSession(sessionName, sessionInfo, verbose = false) {
86
144
  activeSessions.set(sessionName, sessionInfo);
145
+ const mode = sessionInfo.isolationBackend ? `isolation:${sessionInfo.isolationBackend}` : 'screen';
87
146
  if (verbose) {
88
- const mode = sessionInfo.isolationBackend ? `isolation:${sessionInfo.isolationBackend}` : 'screen';
89
147
  console.log(`[VERBOSE] Session ${sessionName} tracked in memory (mode: ${mode})`);
90
148
  }
149
+ // Issue #1927: mirror to the durable store so a restart can resume monitoring.
150
+ // Only isolation-backed sessions are persisted — they are the ones tracked in
151
+ // `$` (start-command) with a reliable status record (requirement #2). Plain
152
+ // screen sessions are timeout-based best-effort; resuming them after a restart
153
+ // could fabricate a "finished" message with no real exit code, so they stay
154
+ // in-memory only.
155
+ if (sessionStore && isPersistableSession(sessionInfo)) {
156
+ try {
157
+ sessionStore.persist(sessionName, sessionInfo);
158
+ } catch (error) {
159
+ console.error(`[session-monitor] Could not persist session ${sessionName}: ${error.message}`);
160
+ }
161
+ }
162
+ logEvent('session_tracked', {
163
+ sessionName,
164
+ mode,
165
+ url: sessionInfo.url || null,
166
+ command: sessionInfo.command || null,
167
+ sessionId: sessionInfo.sessionId || null,
168
+ startTime: sessionInfo.startTime instanceof Date ? sessionInfo.startTime.toISOString() : sessionInfo.startTime || null,
169
+ });
170
+ }
171
+
172
/**
 * Decide whether a session belongs in the durable store: it must carry both
 * an isolation backend and a session id. A missing record never qualifies.
 *
 * @param {object|null|undefined} sessionInfo - Session record to inspect.
 * @returns {boolean} True only when both fields are truthy.
 */
function isPersistableSession(sessionInfo) {
  if (!sessionInfo?.isolationBackend) {
    return false;
  }
  return Boolean(sessionInfo.sessionId);
}
92
181
 
93
182
  /**
@@ -138,11 +227,22 @@ function getActiveSessions(verbose = false) {
138
227
  * @param {string} sessionName - Name of the session to remove
139
228
  * @param {boolean} verbose - Whether to log verbose output
140
229
  */
141
function completeSession(sessionName, exitCode = 0, verbose = false, status = null) {
  // Capture the record before deleting it: the durable-store decision below
  // depends on the fields of the session that is being removed.
  const trackedInfo = activeSessions.get(sessionName) || null;
  activeSessions.delete(sessionName);

  if (verbose) {
    const statusSuffix = status ? `, status: ${status}` : '';
    console.log(`[VERBOSE] Session ${sessionName} removed from tracking (exit: ${exitCode}${statusSuffix})`);
  }

  // Issue #1927: also drop the session from the durable snapshot, handing the
  // store how it ended, so a later restart does not try to resume it. A
  // failing store is reported but never aborts completion.
  const wasPersistable = sessionStore && isPersistableSession(trackedInfo);
  if (wasPersistable) {
    try {
      sessionStore.remove(sessionName, { status, exitCode });
    } catch (removeError) {
      console.error(`[session-monitor] Could not remove persisted session ${sessionName}: ${removeError.message}`);
    }
  }

  logEvent('session_completed', {
    sessionName,
    exitCode: exitCode ?? null,
    status: status || null,
  });
}
147
247
 
148
248
  function isMessageAlreadyUpdatedError(error) {
@@ -214,8 +314,72 @@ function isNonIsolationSessionActive(sessionName, sessionInfo, verbose = false)
214
314
  return true;
215
315
  }
216
316
 
317
+ /**
318
+ * Issue #1927: minimum age before a session that `$ --status` still reports as
319
+ * `executing` is allowed to be declared dead purely on a backend-liveness probe
320
+ * (the screen/tmux/docker session is gone). This avoids a race where a session
321
+ * that has just been launched — but whose backend has not registered yet — is
322
+ * falsely reported as killed. The authoritative log-footer check is NOT gated by
323
+ * this, because a written "Exit Code:" footer is proof the command terminated.
324
+ */
325
+ export const STALE_EXECUTING_MIN_AGE_MS = 90 * 1000;
326
+
327
/**
 * Read a session's start time as epoch milliseconds.
 *
 * Accepts a Date instance or anything the Date constructor understands.
 *
 * @param {object|null|undefined} sessionInfo - Session record; may be absent.
 * @returns {number|null} Epoch milliseconds, or null when the start time is
 *   missing, falsy, or does not parse to a finite timestamp.
 */
function sessionStartMs(sessionInfo) {
  const rawStart = sessionInfo?.startTime;
  if (!rawStart) {
    return null;
  }
  const asDate = rawStart instanceof Date ? rawStart : new Date(rawStart);
  const epochMs = asDate.getTime();
  if (!Number.isFinite(epochMs)) {
    return null;
  }
  return epochMs;
}
334
+
335
/**
 * Cross-check whether a session that `$ --status` still reports as `executing`
 * has in fact terminated (issue #1927: the status can stay on `executing`
 * after the process was killed). Two independent signals are consulted,
 * strongest first:
 *
 *   1. The execution log FOOTER. When the footer reader reports `finished`,
 *      the command has terminated regardless of what `--status` claims. The
 *      footer's exit code is classified via classifyExitStatus, falling back
 *      to `executed` for 0 and `failed` for anything else. The footer's
 *      `endTime`, when present, is passed through so the caller can report
 *      when the session ended.
 *   2. Backend LIVENESS. If no finished footer was found but the liveness
 *      probe answers exactly `false`, the session is reported as `killed`
 *      with no exit code. This probe only runs for isolation-backed sessions
 *      that are at least STALE_EXECUTING_MIN_AGE_MS old (a session with an
 *      unknown start time counts as old enough), to avoid declaring a
 *      just-launched session dead. A `null` probe answer is "no signal".
 *
 * @param {string} sessionName - Registry name, used as the probe id fallback.
 * @param {object} sessionInfo - Session record (logPath, startTime, sessionId, isolationBackend).
 * @param {object|null} statusResult - Status payload; its logPath wins over sessionInfo's.
 * @param {object} deps
 * @param {boolean} deps.verbose - Forwarded to the footer reader and the probe.
 * @param {object} deps.runner - Source of the default footer reader / liveness probe.
 * @param {Function|null} deps.exitFromLog - Optional footer reader override.
 * @param {Function|null} deps.backendAlive - Optional liveness probe override.
 * @returns {Promise<{exitCode: number|null, status: string, reason: string, endTime: *}|null>}
 *   Terminal details when the session is actually dead, else null (still running).
 */
async function resolveStaleExecutingState(sessionName, sessionInfo, statusResult, { verbose, runner, exitFromLog, backendAlive }) {
  // 1. Authoritative: the log footer.
  const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
  if (logPath) {
    const readFooter = exitFromLog || runner.readSessionExitFromLog;
    const footer = readFooter ? readFooter(logPath, { verbose }) : null;
    if (footer?.finished) {
      const status = classifyExitStatus(footer.exitCode) || (footer.exitCode === 0 ? 'executed' : 'failed');
      return {
        exitCode: footer.exitCode,
        status,
        reason: `log-footer(exit ${footer.exitCode})`,
        // Previously dropped: the caller reads `endTime` from this result.
        endTime: footer.endTime ?? null,
      };
    }
  }

  // 2. Liveness probe, only once the session is old enough to have registered.
  const startMs = sessionStartMs(sessionInfo);
  const ageMs = startMs != null ? Date.now() - startMs : Infinity;
  if (ageMs >= STALE_EXECUTING_MIN_AGE_MS && sessionInfo?.isolationBackend) {
    const probe = backendAlive || runner.checkBackendSessionAlive;
    const alive = probe ? await probe(sessionInfo.sessionId || sessionName, sessionInfo.isolationBackend, verbose) : null;
    // Only `false` (definitively gone) counts as killed; `null` (unknown backend)
    // is treated as "no signal" so we don't kill on an indeterminate probe.
    if (alive === false) {
      return { exitCode: null, status: 'killed', reason: 'backend-gone', endTime: null };
    }
  }

  return null;
}
380
+
217
381
  async function getIsolationSessionState(sessionName, sessionInfo, options = {}) {
218
- const { verbose = false, statusProvider = null } = options;
382
+ const { verbose = false, statusProvider = null, exitFromLog = null, backendAlive = null, sessionRunning = null } = options;
219
383
  const sessionId = sessionInfo.sessionId || sessionName;
220
384
 
221
385
  try {
@@ -224,22 +388,81 @@ async function getIsolationSessionState(sessionName, sessionInfo, options = {})
224
388
 
225
389
  if (statusResult?.exists && statusResult.status) {
226
390
  if (runner.isExecutingSessionStatus(statusResult.status)) {
391
+ // Issue #1927: an `executing` status is not trusted blindly — verify the
392
+ // process is really alive. start-command can keep reporting `executing`
393
+ // after a kill, which is exactly how an OOM-killed /solve went unreported.
394
+ const stale = await resolveStaleExecutingState(sessionName, sessionInfo, statusResult, { verbose, runner, exitFromLog, backendAlive });
395
+ if (stale) {
396
+ if (verbose) {
397
+ console.log(`[VERBOSE] Session ${sessionName} reported '${statusResult.status}' but is actually terminated (${stale.reason}); treating as ${stale.status} (exit ${stale.exitCode})`);
398
+ }
399
+ // Rewrite the status payload so downstream completion formatting sees
400
+ // the real terminal status/exit code instead of the stale `executing`.
401
+ const correctedStatus = stale.status || 'killed';
402
+ const corrected = { ...statusResult, status: correctedStatus, exitCode: stale.exitCode, endTime: statusResult.endTime || stale.endTime || null };
403
+ return { running: false, exitCode: stale.exitCode, status: correctedStatus, statusResult: corrected, stale: true };
404
+ }
227
405
  return { running: true, exitCode: null, status: statusResult.status, statusResult };
228
406
  }
229
407
  if (runner.isTerminalSessionStatus(statusResult.status)) {
230
- return {
231
- running: false,
232
- exitCode: statusResult.exitCode !== undefined ? statusResult.exitCode : null,
233
- status: statusResult.status,
234
- statusResult,
235
- };
408
+ let exitCode = statusResult.exitCode !== undefined ? statusResult.exitCode : null;
409
+ // Issue #1927: when start-command reports a terminal status but a missing
410
+ // or sentinel (-1) exit code — which its lingering-shell reverse-flip can
411
+ // produce — recover the real code from the log footer so a SIGKILL is not
412
+ // mislabelled as a generic failure.
413
+ if ((exitCode === null || exitCode === -1) && (statusResult.logPath || sessionInfo?.logPath)) {
414
+ const readFooter = exitFromLog || runner.readSessionExitFromLog;
415
+ const footer = readFooter ? readFooter(statusResult.logPath || sessionInfo.logPath, { verbose }) : null;
416
+ if (footer?.finished) {
417
+ exitCode = footer.exitCode;
418
+ const correctedStatus = classifyExitStatus(footer.exitCode) || statusResult.status;
419
+ if (verbose) {
420
+ console.log(`[VERBOSE] Session ${sessionName} reported terminal '${statusResult.status}' with exit ${statusResult.exitCode}; recovered real exit ${exitCode} (${correctedStatus}) from log footer`);
421
+ }
422
+ return { running: false, exitCode, status: correctedStatus, statusResult: { ...statusResult, status: correctedStatus, exitCode } };
423
+ }
424
+ }
425
+ // Issue #1939: a native docker session can report a terminal status
426
+ // ("executed") with the unknown exit-code sentinel (-1) while the
427
+ // container is still running. When the log footer above did not recover
428
+ // a real terminal exit, such a status is provisional — fall through to
429
+ // isSessionRunning() below, which cross-checks the live container via
430
+ // `docker inspect` before we notify the user the work finished.
431
+ const ambiguousDockerTerminal = sessionInfo.isolationBackend === 'docker' && typeof runner.isUnknownDockerExitCode === 'function' && runner.isUnknownDockerExitCode(exitCode);
432
+ if (!ambiguousDockerTerminal) {
433
+ return { running: false, exitCode, status: statusResult.status, statusResult };
434
+ }
236
435
  }
237
436
  }
238
437
 
239
- const running = await runner.isSessionRunning(sessionId, {
438
+ // The status record is unavailable (no `exists`/`status`). Fall back to a
439
+ // direct backend liveness check. `sessionRunning` is injectable purely so
440
+ // this path is testable without the real `$`/`screen` binaries; production
441
+ // always uses the runner's real check.
442
+ const checkRunning = sessionRunning || runner.isSessionRunning;
443
+ const running = await checkRunning(sessionId, {
240
444
  backend: sessionInfo.isolationBackend,
241
445
  verbose,
242
446
  });
447
+ if (!running) {
448
+ // Issue #1927: the `$ --status` record is unavailable (e.g. garbage-
449
+ // collected while the bot was down) and the backend reports not-running.
450
+ // Before declaring a bare null exit — which classifies as success — try
451
+ // the log footer so a session that was killed while we were offline is
452
+ // reported as the kill it was, not a silent success.
453
+ const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
454
+ if (logPath) {
455
+ const readFooter = exitFromLog || runner.readSessionExitFromLog;
456
+ const footer = readFooter ? readFooter(logPath, { verbose }) : null;
457
+ if (footer?.finished) {
458
+ const correctedStatus = classifyExitStatus(footer.exitCode) || (footer.exitCode === 0 ? 'executed' : 'failed');
459
+ if (verbose) {
460
+ console.log(`[VERBOSE] Session ${sessionName} has no live status record; recovered exit ${footer.exitCode} (${correctedStatus}) from log footer`);
461
+ }
462
+ return { running: false, exitCode: footer.exitCode, status: correctedStatus, statusResult: { ...(statusResult || {}), status: correctedStatus, exitCode: footer.exitCode, endTime: statusResult?.endTime || footer.endTime || null } };
463
+ }
464
+ }
465
+ }
243
466
  return {
244
467
  running,
245
468
  exitCode: running ? null : (statusResult?.exitCode ?? null),
@@ -274,6 +497,7 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
274
497
  let stillRunning;
275
498
  let exitCode = null;
276
499
  let statusResult = null;
500
+ let resolvedStatus = null;
277
501
 
278
502
  if (sessionInfo.isolationBackend && sessionInfo.sessionId) {
279
503
  // Isolation mode: use $ --status, with screen -ls only as a fallback
@@ -282,10 +506,31 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
282
506
  const state = await getIsolationSessionState(sessionName, sessionInfo, {
283
507
  verbose,
284
508
  statusProvider: options.statusProvider,
509
+ exitFromLog: options.exitFromLog,
510
+ backendAlive: options.backendAlive,
511
+ sessionRunning: options.sessionRunning,
285
512
  });
286
513
  stillRunning = state.running;
287
514
  exitCode = state.exitCode;
288
515
  statusResult = state.statusResult;
516
+ resolvedStatus = state.status || statusResult?.status || null;
517
+ if (state.stale && verbose) {
518
+ console.log(`[VERBOSE] Session ${sessionName} detected as killed/terminated despite an 'executing' status report (issue #1927 cross-check)`);
519
+ }
520
+ // Issue #1927: once start-command reveals the log path, record it in the
521
+ // durable snapshot. If the bot dies and restarts after start-command has
522
+ // garbage-collected the status record, the resumed session can still read
523
+ // the log footer to learn whether it was killed.
524
+ if (statusResult?.logPath && sessionInfo.logPath !== statusResult.logPath) {
525
+ sessionInfo.logPath = statusResult.logPath;
526
+ if (sessionStore) {
527
+ try {
528
+ sessionStore.persist(sessionName, sessionInfo);
529
+ } catch {
530
+ /* best effort — persistence must never break monitoring */
531
+ }
532
+ }
533
+ }
289
534
  } else {
290
535
  // Issue #1586: Non-isolation screen sessions cannot reliably detect
291
536
  // completion because start-screen keeps the screen alive via `exec bash`.
@@ -363,6 +608,46 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
363
608
  }
364
609
  }
365
610
 
611
+ // Issue #1927 (review follow-up): when a /solve session was KILLED
612
+ // (OOM/SIGKILL — the silent failure this issue is about), surface a
613
+ // ready-to-run `--resume <lastSessionId>` command so the surviving
614
+ // parent (the operator, or an automation watching the bot) can pick the
615
+ // work back up. We deliberately do NOT auto-relaunch here: a job that
616
+ // reliably OOMs would storm. The rule "use the LAST of multiple
617
+ // sessions" is honored by reading the last `Session ID:` marker from
618
+ // the captured log. Purely additive — failures never block the
619
+ // completion notification, preserving backward compatibility.
620
+ const resumeExtraSections = [];
621
+ try {
622
+ const outcome = classifySessionOutcome({ exitCode: finalExitCode, status: resolvedStatus });
623
+ const isResumableCommand = (sessionInfo?.command || 'solve') === 'solve';
624
+ if (outcome.killed && isResumableCommand) {
625
+ const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
626
+ // The id must be the AI TOOL's session id, not the isolation session
627
+ // id (sessionInfo.sessionId — wrong namespace for `solve --resume`).
628
+ // Prefer the last `Session ID:` marker in the captured log; fall
629
+ // back to the newest `<sessionId>.log` start-command wrote in the
630
+ // same directory. If neither exists, offer no command (a bogus
631
+ // resume id would be worse than none).
632
+ let lastSessionId = readLastSessionIdFromLog(logPath, { verbose });
633
+ if (!lastSessionId && logPath) {
634
+ lastSessionId = findLatestSessionLogId({ dir: path.dirname(logPath), verbose });
635
+ }
636
+ const resumeCommand = buildResumeCommand({ sessionInfo, lastSessionId });
637
+ const resumeSection = formatResumeSection({ lastSessionId, command: resumeCommand });
638
+ if (resumeSection) {
639
+ resumeExtraSections.push(resumeSection);
640
+ if (verbose) {
641
+ console.log(`[VERBOSE] Session ${sessionName} was killed; offering resume from last session ${lastSessionId}`);
642
+ }
643
+ }
644
+ }
645
+ } catch (resumeError) {
646
+ if (verbose) {
647
+ console.log(`[VERBOSE] Could not build resume section for ${sessionName}: ${resumeError?.message || resumeError}`);
648
+ }
649
+ }
650
+
366
651
  const message = formatSessionCompletionMessage({
367
652
  sessionName,
368
653
  sessionInfo,
@@ -371,7 +656,7 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
371
656
  exitCode: finalExitCode,
372
657
  infoBlock: sessionInfo?.infoBlock || '',
373
658
  pullRequestUrl,
374
- extraSections: limitsExtraSections,
659
+ extraSections: [...limitsExtraSections, ...resumeExtraSections],
375
660
  });
376
661
 
377
662
  // Update the original reply message if messageId is available, otherwise send new message
@@ -411,11 +696,11 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
411
696
  }
412
697
  }
413
698
 
414
- completeSession(sessionName, finalExitCode || 0, verbose);
699
+ completeSession(sessionName, finalExitCode || 0, verbose, resolvedStatus);
415
700
  } catch (error) {
416
701
  console.error(`Failed to send completion notification for ${sessionName}:`, error);
417
702
  if (isMessageAlreadyUpdatedError(error)) {
418
- completeSession(sessionName, exitCode || 0, verbose);
703
+ completeSession(sessionName, exitCode || 0, verbose, resolvedStatus);
419
704
  } else {
420
705
  sessionInfo.lastNotificationError = error.message;
421
706
  sessionInfo.lastKnownStatus = statusResult?.status || sessionInfo.lastKnownStatus || null;
@@ -503,10 +788,88 @@ export function startSessionMonitoring(bot, verbose = false, intervalMs = 30000,
503
788
  };
504
789
  const timer = setInterval(runMonitor, intervalMs);
505
790
  runMonitor();
506
- console.log(`📊 Session monitoring started (checking every ${intervalMs / 1000} seconds, storage: in-memory)`);
791
+ const storage = sessionStore ? `durable+in-memory (${sessionStore.snapshotPath})` : 'in-memory';
792
+ console.log(`📊 Session monitoring started (checking every ${intervalMs / 1000} seconds, storage: ${storage})`);
507
793
  return timer;
508
794
  }
509
795
 
796
/**
 * Issue #1927 (requirements #2 and #4): after a bot restart, reload the
 * sessions that were persisted in the durable store and re-register them in
 * the in-memory registry so monitoring resumes watching them to completion.
 *
 * Records are processed in the order the store returns them. A record is
 * skipped (and listed in `skipped` with a reason) when:
 *   - `already-tracked`: its name is already in the in-memory registry;
 *   - `invalid-record`: it has no session name or no object `sessionInfo`
 *     (e.g. a corrupted snapshot entry) — one bad record must not prevent
 *     the remaining sessions from being resumed;
 *   - `started-after-bot-start`: its start time is later than `botStartTime`,
 *     so it cannot be a leftover from a previous run. Records without a
 *     parseable start time are not filtered by this rule.
 *
 * A store whose `load()` throws, or returns something that cannot be
 * iterated, is reported on stderr and yields an empty result rather than a
 * rejected promise.
 *
 * @param {object} [options]
 * @param {object} [options.store] - Session store to load from (default: the store set via setSessionStore).
 * @param {number} [options.botStartTime] - Epoch seconds; sessions that started after this are skipped. Defaults to now.
 * @param {boolean} [options.verbose]
 * @returns {Promise<{resumed: Array<{sessionName: string, sessionInfo: object}>, skipped: Array<{sessionName: string|null, reason: string}>}>}
 */
export async function resumeTrackedSessions(options = {}) {
  const { store = sessionStore, verbose = false, botStartTime = Math.floor(Date.now() / 1000) } = options;
  const resumed = [];
  const skipped = [];

  if (!store) {
    if (verbose) console.log('[VERBOSE] resumeTrackedSessions: no durable session store configured, nothing to resume');
    return { resumed, skipped };
  }

  let persisted;
  try {
    persisted = store.load();
  } catch (error) {
    console.error(`[session-monitor] resumeTrackedSessions: could not load persisted sessions: ${error.message}`);
    return { resumed, skipped };
  }
  // Guard the loop: iterating null/undefined or a non-iterable would throw
  // outside the try block above and reject the whole resume.
  if (persisted == null || typeof persisted[Symbol.iterator] !== 'function') {
    console.error('[session-monitor] resumeTrackedSessions: persisted sessions are not iterable, nothing resumed');
    return { resumed, skipped };
  }

  for (const record of persisted) {
    const sessionName = record?.sessionName;
    const sessionInfo = record?.sessionInfo;

    if (activeSessions.has(sessionName)) {
      skipped.push({ sessionName, reason: 'already-tracked' });
      continue;
    }
    // A malformed record would otherwise be registered and then crash the loop
    // when its fields are read, abandoning every session after it.
    if (sessionName == null || sessionInfo === null || typeof sessionInfo !== 'object') {
      skipped.push({ sessionName: sessionName ?? null, reason: 'invalid-record' });
      if (verbose) console.log(`[VERBOSE] Skipping resume of malformed persisted record (session: ${sessionName ?? 'unknown'})`);
      continue;
    }
    // Requirement #2/#4: a session that started after this bot came up cannot be
    // a leftover from a previous run, so never resume it here.
    const startMs = sessionStartMs(sessionInfo);
    if (startMs != null && startMs > botStartTime * 1000) {
      skipped.push({ sessionName, reason: 'started-after-bot-start' });
      if (verbose) console.log(`[VERBOSE] Skipping resume of ${sessionName}: started after bot start`);
      continue;
    }

    activeSessions.set(sessionName, sessionInfo);
    resumed.push({ sessionName, sessionInfo });
    logEvent('session_resumed', {
      sessionName,
      url: sessionInfo.url || null,
      command: sessionInfo.command || null,
      sessionId: sessionInfo.sessionId || null,
      startTime: sessionInfo.startTime instanceof Date ? sessionInfo.startTime.toISOString() : sessionInfo.startTime || null,
    });
    if (verbose) {
      console.log(`[VERBOSE] Resumed tracking of session ${sessionName} (url: ${sessionInfo.url || 'n/a'}, command: ${sessionInfo.command || 'n/a'}, backend: ${sessionInfo.isolationBackend || 'screen'})`);
    }
  }

  if (resumed.length > 0) {
    console.log(`♻️ Resumed monitoring of ${resumed.length} session(s) from durable store after restart`);
  } else if (verbose) {
    console.log('[VERBOSE] resumeTrackedSessions: no eligible sessions to resume');
  }

  return { resumed, skipped };
}
872
+
510
873
  /**
511
874
  * Issue #1567: Check if there's an active session for a given URL.
512
875
  * This prevents concurrent sessions on the same PR/issue, which causes
@@ -738,9 +1101,17 @@ export async function getRunningSessionItems(verbose = false, options = {}) {
738
1101
  let status = null;
739
1102
 
740
1103
  if (sessionInfo.isolationBackend) {
1104
+ // Forward every injectable seam so the listing applies the same #1927
1105
+ // stale-`executing` reconciliation the monitor does — a session that
1106
+ // start-command still reports as `executing` but whose backend is gone (or
1107
+ // whose log footer shows a kill) must not be listed as running — and so the
1108
+ // whole path stays controllable from tests.
741
1109
  const state = await getIsolationSessionState(sessionName, sessionInfo, {
742
1110
  verbose,
743
1111
  statusProvider: options.statusProvider,
1112
+ exitFromLog: options.exitFromLog,
1113
+ backendAlive: options.backendAlive,
1114
+ sessionRunning: options.sessionRunning,
744
1115
  });
745
1116
  running = state.running;
746
1117
  status = state.status || null;