@link-assistant/hive-mind 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +92 -0
- package/package.json +1 -1
- package/src/bot-lifecycle.lib.mjs +128 -0
- package/src/bot-logger.lib.mjs +253 -0
- package/src/cleanup.lib.mjs +22 -4
- package/src/cleanup.mjs +15 -2
- package/src/cleanup.os.lib.mjs +94 -8
- package/src/isolation-runner.lib.mjs +194 -10
- package/src/session-monitor.lib.mjs +367 -22
- package/src/session-resume.lib.mjs +269 -0
- package/src/session-status.lib.mjs +141 -0
- package/src/session-store.lib.mjs +232 -0
- package/src/telegram-bot.mjs +54 -13
- package/src/telegram-command-execution.lib.mjs +3 -1
- package/src/telegram-terminal-watch-command.lib.mjs +47 -6
- package/src/work-session-formatting.lib.mjs +44 -11
|
@@ -8,18 +8,25 @@
|
|
|
8
8
|
* 1. Screen mode (default): Uses `screen -ls` to detect session completion
|
|
9
9
|
* 2. Isolation mode: Uses `$ --status <uuid>` from start-command CLI for reliable tracking
|
|
10
10
|
*
|
|
11
|
-
* Session state is stored in-memory
|
|
12
|
-
*
|
|
11
|
+
* Session state is stored in-memory and, since issue #1927, mirrored to a
|
|
12
|
+
* durable on-disk store so a bot restart can reload and resume monitoring of
|
|
13
|
+
* detached sessions that were still running when the previous process died. The
|
|
14
|
+
* `$` CLI (start-command) is accessed purely via its CLI interface, not as a
|
|
15
|
+
* library dependency.
|
|
13
16
|
*
|
|
14
17
|
* @see https://github.com/link-foundation/start
|
|
15
18
|
* @see https://github.com/link-assistant/hive-mind/issues/380
|
|
19
|
+
* @see https://github.com/link-assistant/hive-mind/issues/1927
|
|
16
20
|
*/
|
|
17
21
|
|
|
18
22
|
import { exec as execCallback } from 'child_process';
|
|
19
23
|
import fs from 'fs/promises';
|
|
20
24
|
import { promisify } from 'util';
|
|
21
|
-
import { formatSessionCompletionMessage, getSessionCompletionExitCode } from './work-session-formatting.lib.mjs';
|
|
25
|
+
import { formatSessionCompletionMessage, getSessionCompletionExitCode, classifySessionOutcome } from './work-session-formatting.lib.mjs';
|
|
22
26
|
import { notifySubscribers, getSubscriberCount } from './telegram-subscribers.lib.mjs';
|
|
27
|
+
import { classifyExitStatus } from './session-status.lib.mjs';
|
|
28
|
+
import path from 'node:path';
|
|
29
|
+
import { readLastSessionIdFromLog, findLatestSessionLogId, buildResumeCommand, formatResumeSection } from './session-resume.lib.mjs';
|
|
23
30
|
|
|
24
31
|
export { formatSessionCompletionMessage, getSessionCompletionExitCode } from './work-session-formatting.lib.mjs';
|
|
25
32
|
|
|
@@ -36,8 +43,41 @@ async function getIsolationRunner() {
|
|
|
36
43
|
// In-memory session store
|
|
37
44
|
const activeSessions = new Map();
|
|
38
45
|
|
|
46
|
+
// Issue #1927: optional durable mirror of the in-memory registry. When set (by
|
|
47
|
+
// the bot at startup via setSessionStore), every track/complete is persisted so
|
|
48
|
+
// a restart can reload and keep monitoring detached sessions. Left null in unit
|
|
49
|
+
// tests and one-off CLI paths, where in-memory tracking is sufficient.
|
|
50
|
+
let sessionStore = null;
|
|
51
|
+
let sessionLogger = null;
|
|
52
|
+
|
|
53
|
+
/**
 * Install or detach the durable session store (see session-store.lib.mjs)
 * that mirrors the in-memory registry, so tracked sessions can outlive a bot
 * restart.
 *
 * Any falsy argument is normalised to null, which disconnects the store; tests
 * rely on this to return to pure in-memory tracking.
 *
 * @param {object|null} store - Store to attach, or null to disconnect.
 */
export function setSessionStore(store) {
  if (store) {
    sessionStore = store;
    return;
  }
  sessionStore = null;
}
|
|
61
|
+
|
|
62
|
+
/**
 * Install or detach the structured logger (see bot-logger.lib.mjs) that
 * receives session lifecycle events. The logger is optional: when none is
 * attached, lifecycle events are simply not forwarded.
 *
 * Any falsy argument is normalised to null, which detaches the logger.
 *
 * @param {object|null} logger - Logger to attach, or null to detach.
 */
export function setSessionLogger(logger) {
  if (logger) {
    sessionLogger = logger;
    return;
  }
  sessionLogger = null;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Forward a lifecycle event to the attached session logger.
 * Does nothing when no logger is attached or the logger has no callable
 * `event` method.
 *
 * @param {string} type - Event name, e.g. 'session_tracked'.
 * @param {object} data - Event payload handed to the logger unchanged.
 */
function logEvent(type, data) {
  const canEmit = Boolean(sessionLogger) && typeof sessionLogger.event === 'function';
  if (!canEmit) {
    return;
  }
  // Invoke through the logger object so `this` stays bound to it.
  sessionLogger.event(type, data);
}
|
|
76
|
+
|
|
39
77
|
/**
 * Test helper: return the module to a pristine state by forgetting every
 * tracked session and detaching both the durable store and the logger.
 */
export function resetSessionMonitorForTests() {
  sessionLogger = null;
  sessionStore = null;
  activeSessions.clear();
}
|
|
42
82
|
|
|
43
83
|
/**
|
|
@@ -102,10 +142,41 @@ export async function checkScreenSessionExists(sessionName) {
|
|
|
102
142
|
*/
|
|
103
143
|
export function trackSession(sessionName, sessionInfo, verbose = false) {
  // Register in the in-memory registry first; everything below is secondary.
  activeSessions.set(sessionName, sessionInfo);

  const backend = sessionInfo.isolationBackend;
  const mode = backend ? `isolation:${backend}` : 'screen';
  if (verbose) {
    console.log(`[VERBOSE] Session ${sessionName} tracked in memory (mode: ${mode})`);
  }

  // Issue #1927: mirror the session to the durable store so a restart can pick
  // monitoring back up. Only isolation-backed sessions qualify: they are tracked
  // by `$` (start-command) with a reliable status record. Plain screen sessions
  // are timeout-based best-effort, and resuming one after a restart could
  // fabricate a "finished" message without a real exit code, so they remain
  // in-memory only.
  const shouldMirror = Boolean(sessionStore) && isPersistableSession(sessionInfo);
  if (shouldMirror) {
    try {
      sessionStore.persist(sessionName, sessionInfo);
    } catch (persistError) {
      // A persistence failure is reported but does not undo in-memory tracking.
      console.error(`[session-monitor] Could not persist session ${sessionName}: ${persistError.message}`);
    }
  }

  // Date instances are serialised to ISO strings; any other truthy value is
  // passed through as-is, and a missing start time becomes null.
  let startTime = sessionInfo.startTime || null;
  if (sessionInfo.startTime instanceof Date) {
    startTime = sessionInfo.startTime.toISOString();
  }

  const eventPayload = {
    sessionName,
    mode,
    url: sessionInfo.url || null,
    command: sessionInfo.command || null,
    sessionId: sessionInfo.sessionId || null,
    startTime,
  };
  logEvent('session_tracked', eventPayload);
}
|
|
171
|
+
|
|
172
|
+
/**
 * Decide whether a session belongs in the durable store. A session qualifies
 * only when it has both an isolation backend and a session id (the
 * start-command UUID); see the rationale in trackSession.
 *
 * @param {object} sessionInfo - Session record; may be null or undefined.
 * @returns {boolean} True when both fields are present and truthy.
 */
function isPersistableSession(sessionInfo) {
  if (!sessionInfo?.isolationBackend) {
    return false;
  }
  return Boolean(sessionInfo.sessionId);
}
|
|
110
181
|
|
|
111
182
|
/**
|
|
@@ -156,11 +227,22 @@ function getActiveSessions(verbose = false) {
|
|
|
156
227
|
* @param {string} sessionName - Name of the session to remove
|
|
157
228
|
* @param {boolean} verbose - Whether to log verbose output
|
|
158
229
|
*/
|
|
159
|
-
function completeSession(sessionName, exitCode = 0, verbose = false) {
|
|
230
|
+
function completeSession(sessionName, exitCode = 0, verbose = false, status = null) {
  // Grab the record before deleting it: it decides below whether the session
  // was mirrored to the durable store.
  const trackedInfo = activeSessions.get(sessionName) || null;
  activeSessions.delete(sessionName);

  if (verbose) {
    const statusNote = status ? `, status: ${status}` : '';
    console.log(`[VERBOSE] Session ${sessionName} removed from tracking (exit: ${exitCode}${statusNote})`);
  }

  // Issue #1927: drop the session from the durable snapshot, passing along how
  // it ended, so a later restart does not try to resume it.
  const wasMirrored = Boolean(sessionStore) && isPersistableSession(trackedInfo);
  if (wasMirrored) {
    try {
      sessionStore.remove(sessionName, { status, exitCode });
    } catch (removeError) {
      // Reported only; a store failure does not undo the in-memory removal.
      console.error(`[session-monitor] Could not remove persisted session ${sessionName}: ${removeError.message}`);
    }
  }

  const eventPayload = { sessionName, exitCode: exitCode ?? null, status: status || null };
  logEvent('session_completed', eventPayload);
}
|
|
165
247
|
|
|
166
248
|
function isMessageAlreadyUpdatedError(error) {
|
|
@@ -232,8 +314,72 @@ function isNonIsolationSessionActive(sessionName, sessionInfo, verbose = false)
|
|
|
232
314
|
return true;
|
|
233
315
|
}
|
|
234
316
|
|
|
317
|
+
/**
 * Issue #1927: how old (in milliseconds) a session must be before a status of
 * `executing` from `$ --status` may be overridden purely because the backend
 * liveness probe says the screen/tmux/docker session is gone.
 *
 * The grace period protects a freshly launched session whose backend has not
 * registered yet from being falsely reported as killed. It does NOT apply to
 * the log-footer check: a written "Exit Code:" footer is proof the command
 * terminated, whatever the session's age.
 */
export const STALE_EXECUTING_MIN_AGE_MS = 1.5 * 60 * 1000;
|
|
326
|
+
|
|
327
|
+
/**
 * Read a session's start time as epoch milliseconds.
 *
 * Accepts a Date instance or any value the Date constructor can parse (for
 * example an ISO string or an epoch-millisecond number).
 *
 * @param {object} sessionInfo - Session record; may be null or undefined.
 * @returns {number|null} Epoch milliseconds, or null when the start time is
 *   missing, falsy, or does not parse to a valid date.
 */
function sessionStartMs(sessionInfo) {
  const rawStart = sessionInfo?.startTime;
  if (!rawStart) {
    return null;
  }
  const epochMs = (rawStart instanceof Date ? rawStart : new Date(rawStart)).getTime();
  if (!Number.isFinite(epochMs)) {
    return null;
  }
  return epochMs;
}
|
|
334
|
+
|
|
335
|
+
/**
 * Cross-check whether a session that `$ --status` still reports as `executing`
 * has actually terminated. Issue #1927: start-command's status can get stuck on
 * `executing` after the process was killed (a lingering shell keeps the screen
 * session alive, flipping executed→executing), so a SIGKILLed /solve was never
 * reported. Two independent signals are consulted, strongest first:
 *
 *   1. The execution log FOOTER. When start-command wrote "Exit Code: N" the
 *      command terminated, full stop — regardless of what `--status` claims.
 *      This is authoritative and is NOT gated by session age.
 *   2. Backend LIVENESS. If no footer was written (e.g. the wrapper itself was
 *      hard-killed) but the backing screen/tmux/docker session is gone, the
 *      process cannot still be executing. Gated by STALE_EXECUTING_MIN_AGE_MS to
 *      avoid a just-launched-not-yet-registered race.
 *
 * The result carries `endTime` because the caller falls back to
 * `stale.endTime` when the status record has no end time of its own; it is the
 * footer's `endTime` when the footer provides one, otherwise null.
 *
 * @param {string} sessionName - Tracked session name; used as the probe id when the record has no sessionId.
 * @param {object} sessionInfo - Tracked session record (logPath, startTime, isolationBackend, sessionId are read).
 * @param {object} statusResult - Status payload from `$ --status`; its logPath takes precedence over sessionInfo.logPath.
 * @param {object} deps
 * @param {boolean} deps.verbose - Passed through to the footer reader and the liveness probe.
 * @param {object} deps.runner - Isolation runner supplying the default `readSessionExitFromLog` / `checkBackendSessionAlive`.
 * @param {Function|null} deps.exitFromLog - Optional override for the footer reader.
 * @param {Function|null} deps.backendAlive - Optional override for the liveness probe.
 * @returns {Promise<{exitCode: number|null, status: string, reason: string, endTime: *}|null>}
 *   Terminal details when the session is actually dead, else null (still running).
 */
async function resolveStaleExecutingState(sessionName, sessionInfo, statusResult, { verbose, runner, exitFromLog, backendAlive }) {
  // 1. Authoritative: the log footer.
  const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
  if (logPath) {
    const readFooter = exitFromLog || runner.readSessionExitFromLog;
    const footer = readFooter ? readFooter(logPath, { verbose }) : null;
    if (footer?.finished) {
      // Prefer the shared classifier; fall back to a plain success/failure split
      // when it yields nothing for this exit code.
      const status = classifyExitStatus(footer.exitCode) || (footer.exitCode === 0 ? 'executed' : 'failed');
      return {
        exitCode: footer.exitCode,
        status,
        reason: `log-footer(exit ${footer.exitCode})`,
        // Previously dropped, which left the caller's `stale.endTime` fallback
        // permanently undefined.
        endTime: footer.endTime ?? null,
      };
    }
  }

  // 2. Liveness probe, only once the session is old enough to have registered.
  // An unknown start time counts as "old enough" (age = Infinity).
  const startMs = sessionStartMs(sessionInfo);
  const ageMs = startMs != null ? Date.now() - startMs : Infinity;
  if (ageMs >= STALE_EXECUTING_MIN_AGE_MS && sessionInfo?.isolationBackend) {
    const probe = backendAlive || runner.checkBackendSessionAlive;
    const alive = probe ? await probe(sessionInfo.sessionId || sessionName, sessionInfo.isolationBackend, verbose) : null;
    // Only `false` (definitively gone) counts as killed; `null` (unknown backend)
    // is treated as "no signal" so we don't kill on an indeterminate probe.
    if (alive === false) {
      return { exitCode: null, status: 'killed', reason: 'backend-gone', endTime: null };
    }
  }

  return null;
}
|
|
380
|
+
|
|
235
381
|
async function getIsolationSessionState(sessionName, sessionInfo, options = {}) {
|
|
236
|
-
const { verbose = false, statusProvider = null } = options;
|
|
382
|
+
const { verbose = false, statusProvider = null, exitFromLog = null, backendAlive = null, sessionRunning = null } = options;
|
|
237
383
|
const sessionId = sessionInfo.sessionId || sessionName;
|
|
238
384
|
|
|
239
385
|
try {
|
|
@@ -242,30 +388,81 @@ async function getIsolationSessionState(sessionName, sessionInfo, options = {})
|
|
|
242
388
|
|
|
243
389
|
if (statusResult?.exists && statusResult.status) {
|
|
244
390
|
if (runner.isExecutingSessionStatus(statusResult.status)) {
|
|
391
|
+
// Issue #1927: an `executing` status is not trusted blindly — verify the
|
|
392
|
+
// process is really alive. start-command can keep reporting `executing`
|
|
393
|
+
// after a kill, which is exactly how an OOM-killed /solve went unreported.
|
|
394
|
+
const stale = await resolveStaleExecutingState(sessionName, sessionInfo, statusResult, { verbose, runner, exitFromLog, backendAlive });
|
|
395
|
+
if (stale) {
|
|
396
|
+
if (verbose) {
|
|
397
|
+
console.log(`[VERBOSE] Session ${sessionName} reported '${statusResult.status}' but is actually terminated (${stale.reason}); treating as ${stale.status} (exit ${stale.exitCode})`);
|
|
398
|
+
}
|
|
399
|
+
// Rewrite the status payload so downstream completion formatting sees
|
|
400
|
+
// the real terminal status/exit code instead of the stale `executing`.
|
|
401
|
+
const correctedStatus = stale.status || 'killed';
|
|
402
|
+
const corrected = { ...statusResult, status: correctedStatus, exitCode: stale.exitCode, endTime: statusResult.endTime || stale.endTime || null };
|
|
403
|
+
return { running: false, exitCode: stale.exitCode, status: correctedStatus, statusResult: corrected, stale: true };
|
|
404
|
+
}
|
|
245
405
|
return { running: true, exitCode: null, status: statusResult.status, statusResult };
|
|
246
406
|
}
|
|
247
407
|
if (runner.isTerminalSessionStatus(statusResult.status)) {
|
|
408
|
+
let exitCode = statusResult.exitCode !== undefined ? statusResult.exitCode : null;
|
|
409
|
+
// Issue #1927: when start-command reports a terminal status but a missing
|
|
410
|
+
// or sentinel (-1) exit code — which its lingering-shell reverse-flip can
|
|
411
|
+
// produce — recover the real code from the log footer so a SIGKILL is not
|
|
412
|
+
// mislabelled as a generic failure.
|
|
413
|
+
if ((exitCode === null || exitCode === -1) && (statusResult.logPath || sessionInfo?.logPath)) {
|
|
414
|
+
const readFooter = exitFromLog || runner.readSessionExitFromLog;
|
|
415
|
+
const footer = readFooter ? readFooter(statusResult.logPath || sessionInfo.logPath, { verbose }) : null;
|
|
416
|
+
if (footer?.finished) {
|
|
417
|
+
exitCode = footer.exitCode;
|
|
418
|
+
const correctedStatus = classifyExitStatus(footer.exitCode) || statusResult.status;
|
|
419
|
+
if (verbose) {
|
|
420
|
+
console.log(`[VERBOSE] Session ${sessionName} reported terminal '${statusResult.status}' with exit ${statusResult.exitCode}; recovered real exit ${exitCode} (${correctedStatus}) from log footer`);
|
|
421
|
+
}
|
|
422
|
+
return { running: false, exitCode, status: correctedStatus, statusResult: { ...statusResult, status: correctedStatus, exitCode } };
|
|
423
|
+
}
|
|
424
|
+
}
|
|
248
425
|
// Issue #1939: a native docker session can report a terminal status
|
|
249
426
|
// ("executed") with the unknown exit-code sentinel (-1) while the
|
|
250
|
-
// container is still running.
|
|
251
|
-
//
|
|
252
|
-
//
|
|
253
|
-
|
|
427
|
+
// container is still running. When the log footer above did not recover
|
|
428
|
+
// a real terminal exit, such a status is provisional — fall through to
|
|
429
|
+
// isSessionRunning() below, which cross-checks the live container via
|
|
430
|
+
// `docker inspect` before we notify the user the work finished.
|
|
431
|
+
const ambiguousDockerTerminal = sessionInfo.isolationBackend === 'docker' && typeof runner.isUnknownDockerExitCode === 'function' && runner.isUnknownDockerExitCode(exitCode);
|
|
254
432
|
if (!ambiguousDockerTerminal) {
|
|
255
|
-
return {
|
|
256
|
-
running: false,
|
|
257
|
-
exitCode: statusResult.exitCode !== undefined ? statusResult.exitCode : null,
|
|
258
|
-
status: statusResult.status,
|
|
259
|
-
statusResult,
|
|
260
|
-
};
|
|
433
|
+
return { running: false, exitCode, status: statusResult.status, statusResult };
|
|
261
434
|
}
|
|
262
435
|
}
|
|
263
436
|
}
|
|
264
437
|
|
|
265
|
-
|
|
438
|
+
// The status record is unavailable (no `exists`/`status`). Fall back to a
|
|
439
|
+
// direct backend liveness check. `sessionRunning` is injectable purely so
|
|
440
|
+
// this path is testable without the real `$`/`screen` binaries; production
|
|
441
|
+
// always uses the runner's real check.
|
|
442
|
+
const checkRunning = sessionRunning || runner.isSessionRunning;
|
|
443
|
+
const running = await checkRunning(sessionId, {
|
|
266
444
|
backend: sessionInfo.isolationBackend,
|
|
267
445
|
verbose,
|
|
268
446
|
});
|
|
447
|
+
if (!running) {
|
|
448
|
+
// Issue #1927: the `$ --status` record is unavailable (e.g. garbage-
|
|
449
|
+
// collected while the bot was down) and the backend reports not-running.
|
|
450
|
+
// Before declaring a bare null exit — which classifies as success — try
|
|
451
|
+
// the log footer so a session that was killed while we were offline is
|
|
452
|
+
// reported as the kill it was, not a silent success.
|
|
453
|
+
const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
|
|
454
|
+
if (logPath) {
|
|
455
|
+
const readFooter = exitFromLog || runner.readSessionExitFromLog;
|
|
456
|
+
const footer = readFooter ? readFooter(logPath, { verbose }) : null;
|
|
457
|
+
if (footer?.finished) {
|
|
458
|
+
const correctedStatus = classifyExitStatus(footer.exitCode) || (footer.exitCode === 0 ? 'executed' : 'failed');
|
|
459
|
+
if (verbose) {
|
|
460
|
+
console.log(`[VERBOSE] Session ${sessionName} has no live status record; recovered exit ${footer.exitCode} (${correctedStatus}) from log footer`);
|
|
461
|
+
}
|
|
462
|
+
return { running: false, exitCode: footer.exitCode, status: correctedStatus, statusResult: { ...(statusResult || {}), status: correctedStatus, exitCode: footer.exitCode, endTime: statusResult?.endTime || footer.endTime || null } };
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
}
|
|
269
466
|
return {
|
|
270
467
|
running,
|
|
271
468
|
exitCode: running ? null : (statusResult?.exitCode ?? null),
|
|
@@ -300,6 +497,7 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
|
|
|
300
497
|
let stillRunning;
|
|
301
498
|
let exitCode = null;
|
|
302
499
|
let statusResult = null;
|
|
500
|
+
let resolvedStatus = null;
|
|
303
501
|
|
|
304
502
|
if (sessionInfo.isolationBackend && sessionInfo.sessionId) {
|
|
305
503
|
// Isolation mode: use $ --status, with screen -ls only as a fallback
|
|
@@ -308,10 +506,31 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
|
|
|
308
506
|
const state = await getIsolationSessionState(sessionName, sessionInfo, {
|
|
309
507
|
verbose,
|
|
310
508
|
statusProvider: options.statusProvider,
|
|
509
|
+
exitFromLog: options.exitFromLog,
|
|
510
|
+
backendAlive: options.backendAlive,
|
|
511
|
+
sessionRunning: options.sessionRunning,
|
|
311
512
|
});
|
|
312
513
|
stillRunning = state.running;
|
|
313
514
|
exitCode = state.exitCode;
|
|
314
515
|
statusResult = state.statusResult;
|
|
516
|
+
resolvedStatus = state.status || statusResult?.status || null;
|
|
517
|
+
if (state.stale && verbose) {
|
|
518
|
+
console.log(`[VERBOSE] Session ${sessionName} detected as killed/terminated despite an 'executing' status report (issue #1927 cross-check)`);
|
|
519
|
+
}
|
|
520
|
+
// Issue #1927: once start-command reveals the log path, record it in the
|
|
521
|
+
// durable snapshot. If the bot dies and restarts after start-command has
|
|
522
|
+
// garbage-collected the status record, the resumed session can still read
|
|
523
|
+
// the log footer to learn whether it was killed.
|
|
524
|
+
if (statusResult?.logPath && sessionInfo.logPath !== statusResult.logPath) {
|
|
525
|
+
sessionInfo.logPath = statusResult.logPath;
|
|
526
|
+
if (sessionStore) {
|
|
527
|
+
try {
|
|
528
|
+
sessionStore.persist(sessionName, sessionInfo);
|
|
529
|
+
} catch {
|
|
530
|
+
/* best effort — persistence must never break monitoring */
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
}
|
|
315
534
|
} else {
|
|
316
535
|
// Issue #1586: Non-isolation screen sessions cannot reliably detect
|
|
317
536
|
// completion because start-screen keeps the screen alive via `exec bash`.
|
|
@@ -389,6 +608,46 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
|
|
|
389
608
|
}
|
|
390
609
|
}
|
|
391
610
|
|
|
611
|
+
// Issue #1927 (review follow-up): when a /solve session was KILLED
|
|
612
|
+
// (OOM/SIGKILL — the silent failure this issue is about), surface a
|
|
613
|
+
// ready-to-run `--resume <lastSessionId>` command so the surviving
|
|
614
|
+
// parent (the operator, or an automation watching the bot) can pick the
|
|
615
|
+
// work back up. We deliberately do NOT auto-relaunch here: a job that
|
|
616
|
+
// reliably OOMs would storm. The rule "use the LAST of multiple
|
|
617
|
+
// sessions" is honored by reading the last `Session ID:` marker from
|
|
618
|
+
// the captured log. Purely additive — failures never block the
|
|
619
|
+
// completion notification, preserving backward compatibility.
|
|
620
|
+
const resumeExtraSections = [];
|
|
621
|
+
try {
|
|
622
|
+
const outcome = classifySessionOutcome({ exitCode: finalExitCode, status: resolvedStatus });
|
|
623
|
+
const isResumableCommand = (sessionInfo?.command || 'solve') === 'solve';
|
|
624
|
+
if (outcome.killed && isResumableCommand) {
|
|
625
|
+
const logPath = statusResult?.logPath || sessionInfo?.logPath || null;
|
|
626
|
+
// The id must be the AI TOOL's session id, not the isolation session
|
|
627
|
+
// id (sessionInfo.sessionId — wrong namespace for `solve --resume`).
|
|
628
|
+
// Prefer the last `Session ID:` marker in the captured log; fall
|
|
629
|
+
// back to the newest `<sessionId>.log` start-command wrote in the
|
|
630
|
+
// same directory. If neither exists, offer no command (a bogus
|
|
631
|
+
// resume id would be worse than none).
|
|
632
|
+
let lastSessionId = readLastSessionIdFromLog(logPath, { verbose });
|
|
633
|
+
if (!lastSessionId && logPath) {
|
|
634
|
+
lastSessionId = findLatestSessionLogId({ dir: path.dirname(logPath), verbose });
|
|
635
|
+
}
|
|
636
|
+
const resumeCommand = buildResumeCommand({ sessionInfo, lastSessionId });
|
|
637
|
+
const resumeSection = formatResumeSection({ lastSessionId, command: resumeCommand });
|
|
638
|
+
if (resumeSection) {
|
|
639
|
+
resumeExtraSections.push(resumeSection);
|
|
640
|
+
if (verbose) {
|
|
641
|
+
console.log(`[VERBOSE] Session ${sessionName} was killed; offering resume from last session ${lastSessionId}`);
|
|
642
|
+
}
|
|
643
|
+
}
|
|
644
|
+
}
|
|
645
|
+
} catch (resumeError) {
|
|
646
|
+
if (verbose) {
|
|
647
|
+
console.log(`[VERBOSE] Could not build resume section for ${sessionName}: ${resumeError?.message || resumeError}`);
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
|
|
392
651
|
const message = formatSessionCompletionMessage({
|
|
393
652
|
sessionName,
|
|
394
653
|
sessionInfo,
|
|
@@ -397,7 +656,7 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
|
|
|
397
656
|
exitCode: finalExitCode,
|
|
398
657
|
infoBlock: sessionInfo?.infoBlock || '',
|
|
399
658
|
pullRequestUrl,
|
|
400
|
-
extraSections: limitsExtraSections,
|
|
659
|
+
extraSections: [...limitsExtraSections, ...resumeExtraSections],
|
|
401
660
|
});
|
|
402
661
|
|
|
403
662
|
// Update the original reply message if messageId is available, otherwise send new message
|
|
@@ -437,11 +696,11 @@ export async function monitorSessions(bot, verbose = false, options = {}) {
|
|
|
437
696
|
}
|
|
438
697
|
}
|
|
439
698
|
|
|
440
|
-
completeSession(sessionName, finalExitCode || 0, verbose);
|
|
699
|
+
completeSession(sessionName, finalExitCode || 0, verbose, resolvedStatus);
|
|
441
700
|
} catch (error) {
|
|
442
701
|
console.error(`Failed to send completion notification for ${sessionName}:`, error);
|
|
443
702
|
if (isMessageAlreadyUpdatedError(error)) {
|
|
444
|
-
completeSession(sessionName, exitCode || 0, verbose);
|
|
703
|
+
completeSession(sessionName, exitCode || 0, verbose, resolvedStatus);
|
|
445
704
|
} else {
|
|
446
705
|
sessionInfo.lastNotificationError = error.message;
|
|
447
706
|
sessionInfo.lastKnownStatus = statusResult?.status || sessionInfo.lastKnownStatus || null;
|
|
@@ -529,10 +788,88 @@ export function startSessionMonitoring(bot, verbose = false, intervalMs = 30000,
|
|
|
529
788
|
};
|
|
530
789
|
const timer = setInterval(runMonitor, intervalMs);
|
|
531
790
|
runMonitor();
|
|
532
|
-
|
|
791
|
+
const storage = sessionStore ? `durable+in-memory (${sessionStore.snapshotPath})` : 'in-memory';
|
|
792
|
+
console.log(`📊 Session monitoring started (checking every ${intervalMs / 1000} seconds, storage: ${storage})`);
|
|
533
793
|
return timer;
|
|
534
794
|
}
|
|
535
795
|
|
|
796
|
+
/**
 * Issue #1927 (requirements #2 and #4): after a bot restart, reload the sessions
 * that were still being tracked when the previous process died and re-register
 * them so {@link monitorSessions} resumes watching them to completion. The very
 * next monitor tick re-queries each session's status — so a session that was
 * *killed while the bot was down* is finally reported (via the log-footer /
 * backend-liveness cross-check in {@link getIsolationSessionState}) instead of
 * vanishing silently.
 *
 * Only sessions persisted by this bot are resumed. The durable snapshot holds
 * the sessions that had not completed when the previous process died, because
 * completed sessions are removed from it. As a guard, any record whose
 * startTime is after the current bot start is skipped (it cannot belong to a
 * previous run).
 *
 * Malformed records (a non-object entry, a missing session name, or a missing
 * sessionInfo) are skipped with reason `invalid-record` rather than aborting the
 * whole resume, so one corrupt entry cannot stop healthy sessions from being
 * re-registered. A `load()` result that is not iterable is treated as empty.
 *
 * Registration happens synchronously (the body contains no `await`), so the
 * sessions are in the in-memory registry as soon as this function is called.
 *
 * @param {object} [options]
 * @param {object} [options.store] - Session store to load from (default: the store set via setSessionStore).
 * @param {number} [options.botStartTime] - Epoch seconds; sessions started at or before this are resumed, later ones are skipped. Defaults to now.
 * @param {boolean} [options.verbose]
 * @returns {Promise<{resumed: Array<{sessionName: string, sessionInfo: object}>, skipped: Array<{sessionName: string, reason: string}>}>}
 */
export async function resumeTrackedSessions(options = {}) {
  const { store = sessionStore, verbose = false, botStartTime = Math.floor(Date.now() / 1000) } = options;
  const resumed = [];
  const skipped = [];

  if (!store) {
    if (verbose) console.log('[VERBOSE] resumeTrackedSessions: no durable session store configured, nothing to resume');
    return { resumed, skipped };
  }

  let persisted = [];
  try {
    persisted = store.load();
  } catch (error) {
    console.error(`[session-monitor] resumeTrackedSessions: could not load persisted sessions: ${error.message}`);
    return { resumed, skipped };
  }

  // Guard the loop below: iterating a non-iterable would throw and take the
  // whole resume (and possibly bot startup) down with it.
  if (persisted == null || typeof persisted[Symbol.iterator] !== 'function') {
    console.error('[session-monitor] resumeTrackedSessions: persisted sessions are not iterable, nothing to resume');
    return { resumed, skipped };
  }

  for (const record of persisted) {
    const sessionName = record?.sessionName;
    const sessionInfo = record?.sessionInfo;

    // A corrupt entry must not prevent the remaining sessions from resuming.
    if (sessionName == null || sessionInfo === null || typeof sessionInfo !== 'object') {
      skipped.push({ sessionName: String(sessionName ?? 'unknown'), reason: 'invalid-record' });
      if (verbose) console.log(`[VERBOSE] Skipping resume of malformed persisted record (${String(sessionName ?? 'unknown')})`);
      continue;
    }

    if (activeSessions.has(sessionName)) {
      skipped.push({ sessionName, reason: 'already-tracked' });
      continue;
    }

    // Requirement #2/#4: a session that started after this bot came up cannot be
    // a leftover from a previous run, so never resume it here. Records without a
    // parseable start time are resumed.
    const startMs = sessionStartMs(sessionInfo);
    if (startMs != null && startMs > botStartTime * 1000) {
      skipped.push({ sessionName, reason: 'started-after-bot-start' });
      if (verbose) console.log(`[VERBOSE] Skipping resume of ${sessionName}: started after bot start`);
      continue;
    }

    // Written straight into the registry: the record is already in the durable
    // store, so this does not persist it again.
    activeSessions.set(sessionName, sessionInfo);
    resumed.push({ sessionName, sessionInfo });
    logEvent('session_resumed', {
      sessionName,
      url: sessionInfo.url || null,
      command: sessionInfo.command || null,
      sessionId: sessionInfo.sessionId || null,
      startTime: sessionInfo.startTime instanceof Date ? sessionInfo.startTime.toISOString() : sessionInfo.startTime || null,
    });
    if (verbose) {
      console.log(`[VERBOSE] Resumed tracking of session ${sessionName} (url: ${sessionInfo.url || 'n/a'}, command: ${sessionInfo.command || 'n/a'}, backend: ${sessionInfo.isolationBackend || 'screen'})`);
    }
  }

  if (resumed.length > 0) {
    console.log(`♻️ Resumed monitoring of ${resumed.length} session(s) from durable store after restart`);
  } else if (verbose) {
    console.log('[VERBOSE] resumeTrackedSessions: no eligible sessions to resume');
  }

  return { resumed, skipped };
}
|
|
872
|
+
|
|
536
873
|
/**
|
|
537
874
|
* Issue #1567: Check if there's an active session for a given URL.
|
|
538
875
|
* This prevents concurrent sessions on the same PR/issue, which causes
|
|
@@ -764,9 +1101,17 @@ export async function getRunningSessionItems(verbose = false, options = {}) {
|
|
|
764
1101
|
let status = null;
|
|
765
1102
|
|
|
766
1103
|
if (sessionInfo.isolationBackend) {
|
|
1104
|
+
// Forward every injectable seam so the listing applies the same #1927
|
|
1105
|
+
// stale-`executing` reconciliation the monitor does — a session that
|
|
1106
|
+
// start-command still reports as `executing` but whose backend is gone (or
|
|
1107
|
+
// whose log footer shows a kill) must not be listed as running — and so the
|
|
1108
|
+
// whole path stays controllable from tests.
|
|
767
1109
|
const state = await getIsolationSessionState(sessionName, sessionInfo, {
|
|
768
1110
|
verbose,
|
|
769
1111
|
statusProvider: options.statusProvider,
|
|
1112
|
+
exitFromLog: options.exitFromLog,
|
|
1113
|
+
backendAlive: options.backendAlive,
|
|
1114
|
+
sessionRunning: options.sessionRunning,
|
|
770
1115
|
});
|
|
771
1116
|
running = state.running;
|
|
772
1117
|
status = state.status || null;
|