@link-assistant/hive-mind 2.0.2 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +138 -0
- package/package.json +1 -1
- package/src/bot-lifecycle.lib.mjs +128 -0
- package/src/bot-logger.lib.mjs +253 -0
- package/src/cleanup.lib.mjs +22 -4
- package/src/cleanup.mjs +15 -2
- package/src/cleanup.os.lib.mjs +94 -8
- package/src/isolation-runner.lib.mjs +378 -11
- package/src/session-monitor.lib.mjs +389 -18
- package/src/session-resume.lib.mjs +269 -0
- package/src/session-status.lib.mjs +141 -0
- package/src/session-store.lib.mjs +232 -0
- package/src/telegram-bot.mjs +65 -13
- package/src/telegram-command-execution.lib.mjs +3 -1
- package/src/telegram-terminal-watch-command.lib.mjs +47 -6
- package/src/work-session-formatting.lib.mjs +44 -11
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,143 @@
|
|
|
1
1
|
# @link-assistant/hive-mind
|
|
2
2
|
|
|
3
|
+
## 2.0.4
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- f1f9b10: fix(telegram): detect OOM/SIGKILL-ed detached sessions and resume tracking after a bot restart (#1927)
|
|
8
|
+
|
|
9
|
+
A `/solve` running in a detached `screen` session was OOM-killed (exit `137`),
|
|
10
|
+
but the Telegram bot stayed alive and **never reported the failure** — the job
|
|
11
|
+
silently hung forever. Two compounding gaps caused this:
|
|
12
|
+
|
|
13
|
+
**Root cause (RC-1, upstream).** The external `start-command` CLI's
|
|
14
|
+
`enrichDetachedStatus` re-derives a detached session's status from backend
|
|
15
|
+
liveness (`screen -ls`). When a shell lingers after the wrapped command is
|
|
16
|
+
already dead, `$ <id> --status` flips an already-completed record
|
|
17
|
+
(`status: executed`, `exitCode: 137`) **back to `executing` and nulls the exit
|
|
18
|
+
code**, even though `start` itself wrote an authoritative `Exit Code: 137` footer
|
|
19
|
+
to the log. The bot's monitor only reacts to a _terminal_ status, so the kill is
|
|
20
|
+
never surfaced. Confirmed against upstream source and filed with a runnable repro
|
|
21
|
+
as [link-foundation/start#134](https://github.com/link-foundation/start/issues/134)
|
|
22
|
+
(a regression of the fix for upstream #60/#101).
|
|
23
|
+
|
|
24
|
+
**Root cause (RC-2/3/4).** The session monitor's registry was in-memory only, so
|
|
25
|
+
a bot restart orphaned every detached `/solve`; there was no "last alive" marker
|
|
26
|
+
to bound what to resume; and the bot log could be overwritten on restart,
|
|
27
|
+
destroying the evidence needed to reconstruct the failure.
|
|
28
|
+
|
|
29
|
+
**Fix (defensive, consumer side — correct regardless of when upstream lands):**
|
|
30
|
+
- **`src/session-status.lib.mjs`** — a shared, dependency-free status vocabulary
|
|
31
|
+
(`RUNNING`/`KILLED`/`FAILURE`, signal classification for 137/143/139/130) so
|
|
32
|
+
every call site agrees on what an exit code means.
|
|
33
|
+
- **`src/isolation-runner.lib.mjs`** — `parseSessionExitFooter` /
|
|
34
|
+
`readSessionExitFromLog` read the **authoritative log footer**, plus
|
|
35
|
+
`checkBackendSessionAlive` / `isSessionRunning` probe the real backend.
|
|
36
|
+
- **`src/session-monitor.lib.mjs`** — when `--status` says `executing`, cross-check
|
|
37
|
+
the footer (authoritative) and a backend-liveness probe gated by a 90s minimum
|
|
38
|
+
session age, so a SIGKILL is reported instead of hanging, while a just-started
|
|
39
|
+
session is never misread.
|
|
40
|
+
- **`src/session-store.lib.mjs`** — durable session registry (atomic
|
|
41
|
+
`sessions.json` snapshot + append-only, never-truncated `sessions-events.jsonl`)
|
|
42
|
+
so a restart can **resume** tracking and finally report sessions killed while
|
|
43
|
+
the bot was down — resuming only sessions started **before** the bot's start
|
|
44
|
+
time.
|
|
45
|
+
- **`src/bot-logger.lib.mjs`** — every log line is prefixed with an ISO-8601
|
|
46
|
+
millisecond timestamp; structured `event()`/`heartbeat()` markers record "last
|
|
47
|
+
alive"; logs **rotate, never overwrite** (prior log preserved as a timestamped
|
|
48
|
+
backup) so no evidence is destroyed.
|
|
49
|
+
- **`src/bot-lifecycle.lib.mjs`** — heartbeat / resume-on-launch / orderly
|
|
50
|
+
shutdown extracted from `telegram-bot.mjs` as pure injectable factories; a
|
|
51
|
+
timestamped `bot_shutdown` marker distinguishes a clean stop from a hard kill.
|
|
52
|
+
- **`src/work-session-formatting.lib.mjs`** + `telegram-bot.mjs` — completion
|
|
53
|
+
messages now call out a **killed** outcome (❌ killed / signal) distinctly from
|
|
54
|
+
an ordinary failure.
|
|
55
|
+
- **`src/telegram-terminal-watch-command.lib.mjs`** — the same fix applied to the
|
|
56
|
+
live `/terminal_watch` loop (req #8, "fix in all places"): it decided
|
|
57
|
+
"completed" purely from `--status`, so a session killed while `--status` still
|
|
58
|
+
read `executing` would be **polled forever** with a misleading "running"
|
|
59
|
+
snapshot — the #1927 silent-hang, in the watch path. It now cross-checks the
|
|
60
|
+
authoritative log footer (`reconcileWatchCompletion`), stops on a recorded exit,
|
|
61
|
+
corrects the displayed status to the real terminal one (e.g. `killed`), and a
|
|
62
|
+
completed-but-failed session renders a ❌ failure title instead of a ✅.
|
|
63
|
+
- **`src/cleanup.os.lib.mjs`** + `src/cleanup.lib.mjs` — review follow-up:
|
|
64
|
+
deduplicated `$` session-data access (cleanup no longer re-derives sessions
|
|
65
|
+
from `screen -ls`/`tmux ls` + per-session `$ --status`; a single
|
|
66
|
+
`listSessionTasks()` reads the whole catalog from `$ --list`, the same source
|
|
67
|
+
`/queue`, `/limits` and the monitor already funnel through), and the cleanup
|
|
68
|
+
listing now annotates **every** hive-mind folder — active _and_ finished — with
|
|
69
|
+
which PR/issue and which session it belongs (or belonged) to.
|
|
70
|
+
- **`src/session-resume.lib.mjs`** — review follow-up: when a detached `/solve`
|
|
71
|
+
is killed, the surviving parent (the bot, or `/hive`) now surfaces a
|
|
72
|
+
ready-to-run `solve <url> … --resume <lastSessionId>` command in the
|
|
73
|
+
killed-session notification. A single `/solve` run prints many `Session ID:`
|
|
74
|
+
markers (auto-continue, watch restarts, manual resume chains); the module reads
|
|
75
|
+
the **last** marker from the log tail (`selectLastSessionId` /
|
|
76
|
+
`readLastSessionIdFromLog`), with a filesystem fallback
|
|
77
|
+
(`findLatestSessionLogId`). The bot deliberately **surfaces** the command rather
|
|
78
|
+
than auto-relaunching (a job that reliably OOMs would trigger a relaunch storm);
|
|
79
|
+
`planKilledSessionResume` bounds any automatic resume (default `maxAttempts: 1`).
|
|
80
|
+
The section is additive (existing `extraSections` path), emitted only for
|
|
81
|
+
`killed` `/solve` sessions, and failure-isolated so it can never break the
|
|
82
|
+
notification. `args` was added to the persisted session fields so the resume
|
|
83
|
+
command reproduces the original invocation exactly.
|
|
84
|
+
|
|
85
|
+
A `verbose` flag is threaded through the new status/footer/liveness/resume paths
|
|
86
|
+
with explicit `[VERBOSE]` tracing so the next failure leaves a trail (req #6).
|
|
87
|
+
|
|
88
|
+
Added `tests/test-issue-1927-*.mjs` (9 suites, 266 assertions: status vocabulary,
|
|
89
|
+
log-footer parsing, completion labeling, killed-detection, session store, resume,
|
|
90
|
+
bot logger, bot lifecycle, terminal-watch kill). Full deep-dive in
|
|
91
|
+
`docs/case-studies/issue-1927`
|
|
92
|
+
(timeline, 8 requirements, 5 root causes, per-requirement solutions, preserved
|
|
93
|
+
source artifacts), plus a runnable upstream repro under `experiments/`.
|
|
94
|
+
|
|
95
|
+
## 2.0.3
|
|
96
|
+
|
|
97
|
+
### Patch Changes
|
|
98
|
+
|
|
99
|
+
- 40fbf3d: fix(isolation): mount git identity into docker-isolated containers and stop trusting premature terminal status (#1939)
|
|
100
|
+
|
|
101
|
+
A `solve` task launched with `--isolation docker` inside a Docker-in-Docker host
|
|
102
|
+
(`konard/hive-mind-dind:2.0.2`) failed at the system-check stage with
|
|
103
|
+
`❌ Git identity not configured`, even though `gh` was fully authenticated
|
|
104
|
+
(account `konard`). The captured terminal log shows the native start-command
|
|
105
|
+
(`$`) invocation mounting only `~/.config/gh`, `~/.claude`, and `~/.claude.json`
|
|
106
|
+
— **no git identity** — so `git config user.name`/`user.email` were unset inside
|
|
107
|
+
the container and `solve` aborted before doing any work.
|
|
108
|
+
|
|
109
|
+
Root cause: `getDockerIsolationAuthMounts` (`src/isolation-runner.lib.mjs`)
|
|
110
|
+
mounted `gh` and the per-tool credentials but never the git identity. `gh`
|
|
111
|
+
authentication is not a git identity. The fix mounts the host git identity
|
|
112
|
+
(`~/.gitconfig` and the XDG `~/.config/git`, honoring `GIT_CONFIG_GLOBAL` /
|
|
113
|
+
`XDG_CONFIG_HOME`) for **every** tool, alongside `gh`, so the fix applies to all
|
|
114
|
+
isolation callers at once. A new self-healing preflight,
|
|
115
|
+
`ensureHostGitIdentityForIsolation`, gives the mount something to mount: when the
|
|
116
|
+
host has no git identity it derives one from the authenticated `gh` account
|
|
117
|
+
(`gh-setup-git-identity` / `repairGitIdentity`) and, if that is impossible, emits
|
|
118
|
+
one actionable warning naming the exact downstream failure.
|
|
119
|
+
|
|
120
|
+
The same run also exposed a second problem: `$ --list` reported the detached
|
|
121
|
+
session as `status executed` with `exitCode -1` (and no `containerId`) while the
|
|
122
|
+
container was still running, masking the live container and its real exit code.
|
|
123
|
+
`isUnknownDockerExitCode` plus a docker-only cross-check in `isSessionRunning`
|
|
124
|
+
and `getIsolationSessionState` (`src/session-monitor.lib.mjs`) keep an ambiguous
|
|
125
|
+
`terminal + -1` docker session "running" until `docker inspect` confirms the
|
|
126
|
+
container has actually exited; real exit codes and non-docker backends are
|
|
127
|
+
unaffected. A verbose post-launch diagnostic now records `$ --status`, container
|
|
128
|
+
state, and local image presence so the next iteration can confirm the premature
|
|
129
|
+
status and the image re-pull from data.
|
|
130
|
+
|
|
131
|
+
The premature-terminal-status behaviour was reported upstream to
|
|
132
|
+
link-foundation/start and fixed there in `start-command@0.29.1`
|
|
133
|
+
(link-foundation/start#136); `Dockerfile` and `Dockerfile.dind` now pin
|
|
134
|
+
`start-command@0.29.1` so the fixed `$` binary ships in the images, while the
|
|
135
|
+
downstream cross-check stays as defense-in-depth for older hosts.
|
|
136
|
+
|
|
137
|
+
Added `tests/test-issue-1939-docker-isolation.mjs` (25 assertions) and a full
|
|
138
|
+
case study with timeline, root-cause analysis, and the captured logs under
|
|
139
|
+
`docs/case-studies/issue-1939`.
|
|
140
|
+
|
|
3
141
|
## 2.0.2
|
|
4
142
|
|
|
5
143
|
### Patch Changes
|
package/package.json
CHANGED
|
package/src/bot-lifecycle.lib.mjs
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bot lifecycle helpers extracted from telegram-bot.mjs (issue #1927).
|
|
3
|
+
*
|
|
4
|
+
* These three concerns — a periodic liveness heartbeat, resuming tracked
|
|
5
|
+
* sessions on launch, and an orderly shutdown that records a final timestamped
|
|
6
|
+
* marker — were inline in the bot entrypoint, where they could not be unit
|
|
7
|
+
* tested and pushed the file toward the 1500-line limit (see issue #1593). They
|
|
8
|
+
* are pure factories here: every external dependency (logger, clock, process,
|
|
9
|
+
* console, timer) is injected, so production wiring stays identical while the
|
|
10
|
+
* behaviour is exercised directly by tests.
|
|
11
|
+
*
|
|
12
|
+
* @see https://github.com/link-assistant/hive-mind/issues/1927
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
// Default heartbeat cadence in milliseconds (60 s); used as the default `intervalMs` of createHeartbeat.
const DEFAULT_HEARTBEAT_INTERVAL_MS = 60 * 1000;
|
|
16
|
+
|
|
17
|
+
/**
 * Liveness heartbeat factory (requirements #3/#4).
 *
 * `start()` registers `beat` on a repeating timer, unrefs that timer when the
 * handle supports it (so the timer alone never keeps the process alive), and
 * emits one beat immediately. Each beat calls `logger.heartbeat` with the
 * active session count (when a counter function was supplied) and the process
 * uptime in whole seconds. Any failure inside a beat is swallowed so a logging
 * problem can never take the bot down.
 *
 * @param {object} [deps]
 * @param {{ heartbeat: Function }} deps.logger - Receives `{ activeSessions, uptimeSec }`.
 * @param {Function} [deps.getActiveSessionCount] - Called as `getActiveSessionCount(false)`.
 * @param {number} [deps.intervalMs] - Delay between beats, in milliseconds.
 * @param {{ uptime: () => number }} [deps.processImpl=process] - Source of the uptime.
 * @param {Function} [deps.setIntervalImpl=setInterval] - Injectable timer factory.
 * @param {Function} [deps.clearIntervalImpl=clearInterval] - Injectable timer disposer.
 * @returns {{ start: () => void, stop: () => void, beat: () => void, get timer(): any }}
 */
export function createHeartbeat({ logger, getActiveSessionCount, intervalMs = DEFAULT_HEARTBEAT_INTERVAL_MS, processImpl = process, setIntervalImpl = setInterval, clearIntervalImpl = clearInterval } = {}) {
  // Handle of the running interval, or null while stopped.
  let timer = null;

  function beat() {
    try {
      logger.heartbeat({
        activeSessions: typeof getActiveSessionCount === 'function' ? getActiveSessionCount(false) : undefined,
        uptimeSec: Math.floor(processImpl.uptime()),
      });
    } catch {
      /* heartbeat must never crash the bot */
    }
  }

  function start() {
    // Already running: keep the existing timer instead of stacking a second one.
    if (timer) return;
    timer = setIntervalImpl(beat, intervalMs);
    if (typeof timer?.unref === 'function') timer.unref();
    // Emit right away so the log has a marker before the first interval elapses.
    beat();
  }

  function stop() {
    if (!timer) return;
    clearIntervalImpl(timer);
    timer = null;
  }

  return {
    start,
    stop,
    beat,
    get timer() {
      return timer;
    },
  };
}
|
|
62
|
+
|
|
63
|
+
/**
 * Resume sessions left tracked by a previous run (requirements #2/#4).
 *
 * Calls the injected `resumeTrackedSessions({ botStartTime, verbose })` and
 * reports the outcome: a console line when at least one session was resumed,
 * plus a `sessions_resumed` logger event. When the resume call rejects (or
 * resolves to something other than `{ resumed: [], skipped: [] }`) the failure
 * is reported and returned in `error` instead of being thrown.
 *
 * Reporting is strictly best-effort and kept separate from the resume itself:
 * a throwing logger or console can neither escape this function nor turn a
 * successful resume into a reported failure that drops the resumed list.
 *
 * @param {object} [deps]
 * @param {Function} deps.resumeTrackedSessions - Async function resolving to `{ resumed, skipped }` arrays.
 * @param {*} [deps.botStartTime] - Forwarded unchanged to `resumeTrackedSessions`.
 * @param {boolean} [deps.verbose=false] - Forwarded unchanged to `resumeTrackedSessions`.
 * @param {{ event: Function, error: Function }} deps.logger - Structured logger.
 * @param {{ log: Function, error: Function }} [deps.consoleImpl=console] - Injectable console.
 * @returns {Promise<{ resumed: any[], skipped: any[], error?: Error }>}
 */
export async function resumeSessionsOnLaunch({ resumeTrackedSessions, botStartTime, verbose = false, logger, consoleImpl = console } = {}) {
  // Run a reporting step without letting its failure affect the result.
  const bestEffort = report => {
    try {
      report();
    } catch {
      /* reporting must never change the resume outcome */
    }
  };

  let resumed;
  let skipped;
  try {
    ({ resumed, skipped } = await resumeTrackedSessions({ botStartTime, verbose }));
    if (!Array.isArray(resumed) || !Array.isArray(skipped)) {
      throw new TypeError('resumeTrackedSessions must resolve to { resumed: [], skipped: [] }');
    }
  } catch (error) {
    // `error` may be a non-Error rejection (string, null, ...); never dereference it blindly.
    const reason = error?.message ?? String(error);
    bestEffort(() => consoleImpl.error(`[telegram-bot] Failed to resume tracked sessions: ${reason}`));
    bestEffort(() => logger.error('Failed to resume tracked sessions', { error: reason }));
    return { resumed: [], skipped: [], error };
  }

  if (resumed.length > 0) {
    bestEffort(() => consoleImpl.log(`♻️ Resumed ${resumed.length} session(s) from previous run`));
  }
  bestEffort(() =>
    logger.event('sessions_resumed', {
      resumed: resumed.length,
      skipped: skipped.length,
      sessions: resumed.map(r => r?.sessionName),
    })
  );
  return { resumed, skipped };
}
|
|
92
|
+
|
|
93
|
+
/**
 * Build the shutdown signal handler (requirement #3).
 *
 * The returned handler runs these steps in order:
 *   1. `onShutdown()` — caller-owned state changes (flags, aborts, ...).
 *   2. `logger.event('bot_shutdown', …)` with the signal, pid, ppid, active
 *      session count and uptime, so the log shows the bot stopped cleanly; the
 *      ABSENCE of that line before the next startup is how a later analysis
 *      tells an orderly stop from a hard kill.
 *   3. An optional `[VERBOSE]` console line.
 *   4. `cleanup()` — caller-owned timer/queue teardown.
 *   5. `bot.stop(signal)`.
 *
 * Steps 1–4 are individually guarded: a failure in any of them is contained so
 * the following steps — and above all `bot.stop` — still run. Errors thrown by
 * `bot.stop` itself are not caught here.
 *
 * @param {object} [deps]
 * @param {{ event: Function }} deps.logger - Structured logger.
 * @param {Function} [deps.getActiveSessionCount] - Called as `getActiveSessionCount(false)`.
 * @param {boolean} [deps.verbose=false] - Emit the `[VERBOSE]` console line.
 * @param {Function} [deps.onShutdown] - Hook run first.
 * @param {Function} [deps.cleanup] - Hook run just before `bot.stop`.
 * @param {{ stop: Function }} [deps.bot] - Bot whose `stop(signal)` is called last.
 * @param {object} [deps.processImpl=process] - Source of `pid`, `ppid`, `uptime()`.
 * @param {object} [deps.consoleImpl=console] - Injectable console.
 * @returns {(signal: string) => void}
 */
export function createShutdownHandler({ logger, getActiveSessionCount, verbose = false, onShutdown, cleanup, bot, processImpl = process, consoleImpl = console } = {}) {
  return function handleShutdownSignal(signal) {
    try {
      if (typeof onShutdown === 'function') onShutdown();
    } catch (error) {
      // A broken hook must not prevent the shutdown marker, cleanup or bot.stop.
      try {
        consoleImpl.error(`[telegram-bot] onShutdown hook failed: ${error?.message ?? error}`);
      } catch {
        /* reporting is best-effort during shutdown */
      }
    }
    try {
      logger.event('bot_shutdown', {
        signal,
        pid: processImpl.pid,
        ppid: processImpl.ppid,
        activeSessions: typeof getActiveSessionCount === 'function' ? getActiveSessionCount(false) : undefined,
        uptimeSec: Math.floor(processImpl.uptime()),
      });
    } catch {
      /* a logging failure must never block shutdown */
    }
    try {
      if (verbose) consoleImpl.log(`[VERBOSE] Signal: ${signal}, PID: ${processImpl.pid}, PPID: ${processImpl.ppid}`);
    } catch {
      /* verbose tracing is best-effort during shutdown */
    }
    try {
      if (typeof cleanup === 'function') cleanup();
    } catch {
      /* cleanup is best-effort during shutdown */
    }
    if (bot && typeof bot.stop === 'function') bot.stop(signal);
  };
}
|
|
package/src/bot-logger.lib.mjs
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Durable, timestamped logger for the Telegram bot.
|
|
3
|
+
*
|
|
4
|
+
* Issue #1927: when a detached /solve session was OOM-killed (exit 137) the bot
|
|
5
|
+
* stayed alive but never reported the failure, and there was NO bot log file to
|
|
6
|
+
* reconstruct what happened — only ephemeral console output that scrolled away.
|
|
7
|
+
* Requirements #3 and #4 of that issue ask for:
|
|
8
|
+
*
|
|
9
|
+
* - Every log line carries a timestamp, so the exact moment of a total failure
|
|
10
|
+
* (process killed mid-write) can be located afterwards.
|
|
11
|
+
* - Previous bot logs are never destroyed. A restart must not overwrite the
|
|
12
|
+
* log of the run that was killed — that log is the only evidence of when the
|
|
13
|
+
* bot was last alive, which gates which sessions we try to resume.
|
|
14
|
+
*
|
|
15
|
+
* This module mirrors every line to the console (so existing behaviour and
|
|
16
|
+
* `journalctl`/screen capture are unchanged) AND appends it to a rotating log
|
|
17
|
+
* file. On startup the previous active log is preserved under a timestamped
|
|
18
|
+
* backup name instead of being overwritten, and oversized logs rotate the same
|
|
19
|
+
* way mid-run. Backups are pruned only down to a generous configurable cap.
|
|
20
|
+
*
|
|
21
|
+
* The logger is intentionally dependency-free (node:fs/node:path only) and fully
|
|
22
|
+
* injectable so it can be unit-tested without touching the real filesystem.
|
|
23
|
+
*
|
|
24
|
+
* @see https://github.com/link-assistant/hive-mind/issues/1927
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
import fs from 'node:fs';
|
|
28
|
+
import os from 'node:os';
|
|
29
|
+
import path from 'node:path';
|
|
30
|
+
|
|
31
|
+
// Log level names; they match the logger's debug/info/warn/error methods.
export const LOG_LEVELS = ['debug', 'info', 'warn', 'error'];
|
|
32
|
+
|
|
33
|
+
// Default for createBotLogger's `maxBytes` option.
const DEFAULT_MAX_BYTES = 10 * 1024 * 1024; // 10 MiB per active file before rotating
|
|
34
|
+
// Default for createBotLogger's `maxBackups` option; pruning removes the names that sort first.
const DEFAULT_MAX_BACKUPS = 100; // keep up to 100 rotated logs (newest wins on prune)
|
|
35
|
+
|
|
36
|
+
/**
 * Resolve the directory bot logs are written to.
 *
 * Resolution order:
 *   1. `HIVE_MIND_LOG_DIR` from `env`, trimmed and returned as given when non-empty.
 *   2. `<home>/.hive-mind/logs`, where `<home>` comes from `homedir()`.
 *   3. `/tmp/.hive-mind/logs` when `homedir()` throws or yields no usable path.
 *
 * Never throws: a missing/null `env` and a failing or misbehaving `homedir`
 * both fall through to the next step.
 *
 * @param {object} [env=process.env] - Environment variables to consult.
 * @param {Function} [homedir=os.homedir] - Returns the current user's home directory.
 * @returns {string} Directory path (the explicit override is not normalised, so
 *   it is absolute only when `HIVE_MIND_LOG_DIR` itself is absolute).
 */
export function resolveBotLogDir(env = process.env, homedir = os.homedir) {
  // `env` may be null (default parameters only cover undefined).
  const explicit = String(env?.HIVE_MIND_LOG_DIR || '').trim();
  if (explicit) return explicit;

  let home = '/tmp';
  try {
    const candidate = homedir();
    // path.join throws on non-strings and '' would yield a cwd-relative path.
    if (typeof candidate === 'string' && candidate !== '') home = candidate;
  } catch {
    /* no resolvable home directory: keep the /tmp fallback */
  }
  return path.join(home, '.hive-mind', 'logs');
}
|
|
56
|
+
|
|
57
|
+
/**
 * Produce the per-line timestamp prefix: ISO 8601 in UTC with milliseconds.
 *
 * Anything whose `toISOString()` call fails (an invalid Date, a missing value,
 * a non-Date) yields the epoch timestamp instead of throwing.
 *
 * @param {Date} date
 * @returns {string}
 */
export function formatLogTimestamp(date) {
  let stamp;
  try {
    stamp = date.toISOString();
  } catch {
    stamp = new Date(0).toISOString();
  }
  return stamp;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Filesystem-safe variant of the log timestamp for use in backup file names:
 * every `:` and `.` of the ISO string becomes `-`.
 *
 * @param {Date} date
 * @returns {string}
 */
function fileStamp(date) {
  const iso = formatLogTimestamp(date);
  return iso.split(/[:.]/).join('-');
}
|
|
79
|
+
|
|
80
|
+
/**
 * Render the optional metadata of a log line as a suffix that starts with a
 * single space, or as `''` when there is nothing to show.
 *
 * - `undefined`, `null`, `''` and objects that serialise to `{}` give `''`.
 * - Strings are appended verbatim.
 * - Everything else is JSON-encoded; BigInt values are written as decimal
 *   strings, and `Error` instances (top-level or nested) are written with their
 *   `name` and `message` plus any own enumerable fields. Plain
 *   `JSON.stringify` would render an Error as `{}`, silently dropping it from
 *   the log.
 * - Values JSON cannot encode (e.g. circular structures) fall back to
 *   `String(meta)`.
 *
 * @param {*} meta
 * @returns {string}
 */
function serializeMeta(meta) {
  if (meta === undefined || meta === null) return '';
  if (typeof meta === 'string') return meta ? ` ${meta}` : '';

  const replacer = (_key, value) => {
    if (typeof value === 'bigint') return value.toString();
    // name/message live on the prototype or are non-enumerable, so expose them explicitly.
    if (value instanceof Error) return { name: value.name, message: value.message, ...value };
    return value;
  };

  try {
    const json = JSON.stringify(meta, replacer);
    return json && json !== '{}' ? ` ${json}` : '';
  } catch {
    return ` ${String(meta)}`;
  }
}
|
|
90
|
+
|
|
91
|
+
/**
 * Build one structured log line, without a trailing newline.
 * Layout: `<ISO timestamp> <LEVEL> <message><meta suffix>`.
 *
 * The level is upper-cased and right-padded to five characters so the message
 * column lines up; a falsy level is treated as `info`.
 *
 * @param {string} level
 * @param {string} message
 * @param {*} [meta] - Rendered by `serializeMeta`; omitted when empty.
 * @param {Date} [date] - Timestamp of the line; defaults to the current time.
 * @returns {string}
 */
export function formatLogLine(level, message, meta, date = new Date()) {
  const label = String(level || 'info').toUpperCase().padEnd(5);
  const head = `${formatLogTimestamp(date)} ${label} ${message}`;
  return head + serializeMeta(meta);
}
|
|
107
|
+
|
|
108
|
+
/**
 * Create a durable bot logger.
 *
 * Every emitted line is formatted with `formatLogLine`, appended to
 * `<dir>/<baseName>.log`, and (unless `mirrorConsole` is false) echoed to the
 * console. On creation, a non-empty existing active log is renamed to a
 * timestamped backup instead of being overwritten; the same rotation happens
 * mid-run when appending a line would push the active file past `maxBytes`.
 * After each rotation, backups beyond `maxBackups` are removed, names that
 * sort first going first. If the directory cannot be created or a write fails,
 * file logging is switched off for this instance and only the console is used.
 *
 * @param {object} [options]
 * @param {string} [options.dir] - Directory for log files (default: resolveBotLogDir()).
 * @param {string} [options.baseName='telegram-bot'] - Base file name (without extension).
 * @param {number} [options.maxBytes] - Rotate the active file once a write would exceed this many bytes.
 * @param {number} [options.maxBackups] - Keep at most this many rotated backups; a negative or non-finite value disables pruning.
 * @param {boolean} [options.mirrorConsole=true] - Also write each line to console.
 * @param {boolean} [options.verbose=false] - Emit debug-level lines (otherwise suppressed).
 * @param {boolean} [options.rotateOnStart=true] - Preserve a previous active log on startup.
 * @param {object} [options.fsImpl=fs] - Injectable fs (for tests).
 * @param {Function} [options.now] - Injectable clock returning a Date (for tests).
 * @param {object} [options.consoleImpl=console] - Injectable console (for tests).
 * @returns {object} Logger instance.
 */
export function createBotLogger(options = {}) {
  const { dir = resolveBotLogDir(), baseName = 'telegram-bot', maxBytes = DEFAULT_MAX_BYTES, maxBackups = DEFAULT_MAX_BACKUPS, mirrorConsole = true, verbose = false, rotateOnStart = true, fsImpl = fs, now = () => new Date(), consoleImpl = console } = options;

  const activePath = path.join(dir, `${baseName}.log`);
  let fileDisabled = false; // set if the filesystem is unusable; console still works

  // Matches only the backup names this logger creates: `<baseName>-<fileStamp>[-<n>].log`.
  // A plain prefix/suffix test would also match files of another logger whose
  // base name merely starts with ours (e.g. `telegram-bot-debug.log`) and let
  // pruning delete them.
  const escapedBaseName = String(baseName).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const backupPattern = new RegExp(`^${escapedBaseName}-\\d{4}-\\d{2}-\\d{2}T\\d{2}-\\d{2}-\\d{2}-\\d{3}Z(?:-\\d+)?\\.log$`);

  // Create the log directory; on failure report once and disable file logging.
  function ensureDir() {
    try {
      fsImpl.mkdirSync(dir, { recursive: true });
      return true;
    } catch (error) {
      if (!fileDisabled) {
        consoleImpl.error(`[bot-logger] Could not create log dir ${dir}: ${error.message} — file logging disabled, console only`);
      }
      fileDisabled = true;
      return false;
    }
  }

  function backupName(date) {
    return path.join(dir, `${baseName}-${fileStamp(date)}.log`);
  }

  // Preserve the previous run's log instead of overwriting it (requirement #4).
  function rotateExisting(reason) {
    try {
      if (!fsImpl.existsSync(activePath)) return;
      const stat = fsImpl.statSync(activePath);
      // Nothing worth preserving in a missing or empty file.
      if (!stat || stat.size === 0) return;
      let target = backupName(now());
      // Avoid clobbering an existing backup created within the same millisecond.
      let suffix = 1;
      while (fsImpl.existsSync(target)) {
        target = path.join(dir, `${baseName}-${fileStamp(now())}-${suffix}.log`);
        suffix += 1;
      }
      fsImpl.renameSync(activePath, target);
      if (verbose) consoleImpl.log(`[bot-logger] Rotated previous log to ${target} (${reason})`);
      pruneBackups();
    } catch (error) {
      consoleImpl.error(`[bot-logger] Log rotation failed (${reason}): ${error.message}`);
    }
  }

  // Delete the backups that sort first once there are more than `maxBackups`.
  function pruneBackups() {
    if (!Number.isFinite(maxBackups) || maxBackups < 0) return; // unbounded: never destroy
    try {
      const entries = fsImpl
        .readdirSync(dir)
        .filter(name => backupPattern.test(name))
        .sort(); // timestamped names sort chronologically
      const excess = entries.length - maxBackups;
      for (let i = 0; i < excess; i++) {
        try {
          fsImpl.unlinkSync(path.join(dir, entries[i]));
        } catch {
          /* best effort */
        }
      }
    } catch {
      /* best effort */
    }
  }

  if (ensureDir() && rotateOnStart) {
    rotateExisting('startup');
  }

  // Append one line (plus newline) to the active file, rotating first if needed.
  function appendLine(line) {
    if (fileDisabled) return;
    try {
      // Size-based rotation: keep the active file bounded mid-run while never
      // destroying data (the oversized file becomes a timestamped backup).
      let size = 0;
      try {
        size = fsImpl.statSync(activePath).size;
      } catch {
        size = 0;
      }
      const payload = `${line}\n`;
      // `stat.size` is in bytes, so measure the payload in bytes too: a line
      // with non-ASCII text (emoji, Cyrillic) is longer on disk than `.length`.
      if (size > 0 && size + Buffer.byteLength(payload) > maxBytes) {
        rotateExisting('size');
      }
      fsImpl.appendFileSync(activePath, payload);
    } catch (error) {
      if (!fileDisabled) {
        consoleImpl.error(`[bot-logger] Could not write log line: ${error.message} — file logging disabled`);
      }
      fileDisabled = true;
    }
  }

  // Format a line once, then send it to the file and (optionally) the console.
  function emit(level, message, meta) {
    if (level === 'debug' && !verbose) return;
    const date = now();
    const line = formatLogLine(level, message, meta, date);
    appendLine(line);
    if (mirrorConsole) {
      const method = level === 'error' ? 'error' : level === 'warn' ? 'warn' : 'log';
      // Call as a method so an injected console that relies on `this` keeps working.
      consoleImpl[method](line);
    }
  }

  return {
    /** Absolute path of the active log file. */
    get filePath() {
      return activePath;
    },
    /** Absolute directory holding the active + backup logs. */
    get dir() {
      return dir;
    },
    /** True when file writes have been disabled (console still works). */
    get fileDisabled() {
      return fileDisabled;
    },
    debug: (message, meta) => emit('debug', message, meta),
    info: (message, meta) => emit('info', message, meta),
    warn: (message, meta) => emit('warn', message, meta),
    error: (message, meta) => emit('error', message, meta),
    /**
     * Record a structured lifecycle/session event at info level. `type` is
     * placed verbatim after the `EVENT` marker so events are greppable
     * (e.g. `grep ' EVENT session_killed '`).
     */
    event: (type, data) => emit('info', `EVENT ${type}`, data),
    /** Record a heartbeat marker so the last-active time is always discoverable. */
    heartbeat: data => emit('info', 'EVENT heartbeat', { pid: process.pid, ...data }),
    /** Re-export of the formatter for callers that need raw lines. */
    formatLine: formatLogLine,
  };
}
|
package/src/cleanup.lib.mjs
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
* @see https://github.com/link-assistant/hive-mind/issues/1848
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
-
import { isValidIssueBranchName } from './solve.branch.lib.mjs';
|
|
20
|
+
import { isValidIssueBranchName, parseIssueBranchName } from './solve.branch.lib.mjs';
|
|
21
21
|
|
|
22
22
|
/**
|
|
23
23
|
* Directory names directly under the tmp root that must never be removed by
|
|
@@ -296,12 +296,17 @@ export function classifyEntry(entry, ctx) {
|
|
|
296
296
|
export function classifyEntries(entries, ctx) {
|
|
297
297
|
const keep = [];
|
|
298
298
|
const remove = [];
|
|
299
|
-
const { matchers = [], gitInfoByPath = new Map() } = ctx || {};
|
|
299
|
+
const { matchers = [], sessionMatchers = [], gitInfoByPath = new Map() } = ctx || {};
|
|
300
300
|
for (const entry of entries || []) {
|
|
301
301
|
const { action, reason } = classifyEntry(entry, ctx);
|
|
302
302
|
const gitInfo = gitInfoByPath.get(entry.path) || null;
|
|
303
303
|
const activeTask = reason === 'active-task' ? folderMatchesActiveTask(gitInfo, matchers) : null;
|
|
304
|
-
|
|
304
|
+
// For folders that are NOT an active task, look up the (possibly finished)
|
|
305
|
+
// session this folder once belonged to so the listing can still report its
|
|
306
|
+
// PR and session id — issue #1927 review: "also for non active tasks to
|
|
307
|
+
// which pull request and to which session it was belonging."
|
|
308
|
+
const session = activeTask ? null : folderMatchesActiveTask(gitInfo, sessionMatchers);
|
|
309
|
+
const record = { name: entry.name, path: entry.path, size: entry.size ?? null, reason, gitInfo, activeTask, session };
|
|
305
310
|
if (action === 'remove') remove.push(record);
|
|
306
311
|
else keep.push(record);
|
|
307
312
|
}
|
|
@@ -399,7 +404,13 @@ export function formatTaskSummary(task) {
|
|
|
399
404
|
*/
|
|
400
405
|
export function formatEntryContext(item) {
|
|
401
406
|
const details = [];
|
|
402
|
-
if (item?.activeTask)
|
|
407
|
+
if (item?.activeTask) {
|
|
408
|
+
details.push(`task ${formatTaskSummary(item.activeTask)}`);
|
|
409
|
+
} else if (item?.session) {
|
|
410
|
+
// A finished (or otherwise non-active) folder still tells us which PR/issue
|
|
411
|
+
// and session it belonged to — render it with a "was" prefix.
|
|
412
|
+
details.push(`was ${formatTaskSummary(item.session)}`);
|
|
413
|
+
}
|
|
403
414
|
|
|
404
415
|
const gitInfo = item?.gitInfo;
|
|
405
416
|
if (gitInfo) {
|
|
@@ -408,6 +419,13 @@ export function formatEntryContext(item) {
|
|
|
408
419
|
if (remote) gitParts.push(`repo ${remote.owner}/${remote.repo}`);
|
|
409
420
|
if (gitInfo.branch) gitParts.push(`branch ${gitInfo.branch}`);
|
|
410
421
|
if (gitInfo.dirty) gitParts.push('dirty/unpushed');
|
|
422
|
+
// When neither an active task nor a known session resolved the issue/PR,
|
|
423
|
+
// derive the issue number from the folder's branch so every hive-mind
|
|
424
|
+
// folder in the listing still shows which issue it belongs to.
|
|
425
|
+
if (!item?.activeTask && !item?.session && gitInfo.branch) {
|
|
426
|
+
const parsed = parseIssueBranchName(gitInfo.branch);
|
|
427
|
+
if (parsed) gitParts.push(`issue #${parsed.issueNumber}`);
|
|
428
|
+
}
|
|
411
429
|
if (gitParts.length > 0) details.push(gitParts.join(', '));
|
|
412
430
|
}
|
|
413
431
|
|
package/src/cleanup.mjs
CHANGED
|
@@ -36,7 +36,7 @@ import { promises as fsp } from 'node:fs';
|
|
|
36
36
|
|
|
37
37
|
import { isConfirmationYes, readConfirmationLine } from './confirmation.lib.mjs';
|
|
38
38
|
import { classifyEntries, summarize, formatBytes, describeReason, buildActiveMatchers, DEFAULT_PROTECTED_NAMES, formatEntryContext, formatTaskSummary } from './cleanup.lib.mjs';
|
|
39
|
-
import { getTempRoot, listTempEntries, getPathSize, readFolderGitInfo, listProcessHeldPaths, getActiveTasks, removePath, runSystemCleanup, collectProcessDebugReport, signalOrphanedAgentTrees } from './cleanup.os.lib.mjs';
|
|
39
|
+
import { getTempRoot, listTempEntries, getPathSize, readFolderGitInfo, listProcessHeldPaths, getActiveTasks, listSessionTasks, removePath, runSystemCleanup, collectProcessDebugReport, signalOrphanedAgentTrees } from './cleanup.os.lib.mjs';
|
|
40
40
|
import { formatProcessDebugReport } from './process-debug.lib.mjs';
|
|
41
41
|
|
|
42
42
|
const args = process.argv.slice(2);
|
|
@@ -238,9 +238,21 @@ async function main() {
|
|
|
238
238
|
const heldPaths = listProcessHeldPaths(tempRoot);
|
|
239
239
|
await vlog(`Process-held paths: ${[...heldPaths].join(', ') || '(none)'}`);
|
|
240
240
|
|
|
241
|
+
// Enumerate every start-command session once (active AND finished) so we can
|
|
242
|
+
// both detect active tasks and annotate finished folders with the PR/session
|
|
243
|
+
// they belonged to. Reused as the source for getActiveTasks to avoid a second
|
|
244
|
+
// `$ --list` call.
|
|
245
|
+
let sessionTasks = [];
|
|
246
|
+
let sessionMatchers = [];
|
|
247
|
+
if (options.useSessions) {
|
|
248
|
+
sessionTasks = await listSessionTasks({ verbose: options.verbose, resolveBranches: options.resolveBranches });
|
|
249
|
+
sessionMatchers = buildActiveMatchers(sessionTasks);
|
|
250
|
+
await vlog(`Known sessions (active + finished): ${sessionTasks.length}`);
|
|
251
|
+
}
|
|
252
|
+
|
|
241
253
|
let matchers = [];
|
|
242
254
|
if (options.keepActiveTasks) {
|
|
243
|
-
const activeTasks = await getActiveTasks({ useSessions: options.useSessions, resolveBranches: options.resolveBranches });
|
|
255
|
+
const activeTasks = await getActiveTasks({ useSessions: options.useSessions, resolveBranches: options.resolveBranches, sessionTasks });
|
|
244
256
|
matchers = buildActiveMatchers(activeTasks);
|
|
245
257
|
if (activeTasks.length > 0) {
|
|
246
258
|
await log(`🏃 Active tasks detected: ${activeTasks.length}`);
|
|
@@ -275,6 +287,7 @@ async function main() {
|
|
|
275
287
|
selfPaths,
|
|
276
288
|
heldPaths,
|
|
277
289
|
matchers,
|
|
290
|
+
sessionMatchers,
|
|
278
291
|
gitInfoByPath,
|
|
279
292
|
};
|
|
280
293
|
const classified = classifyEntries(entries, ctx);
|