alvin-bot 4.16.1 → 4.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,53 @@
2
2
 
3
3
  All notable changes to Alvin Bot are documented here.
4
4
 
5
+ ## [4.18.0] — 2026-04-20
6
+
7
+ ### ⚡ Performance + Hardening: medium-priority cleanups from the stability audit
8
+
9
+ Completes the audit work started in 4.17.0 by addressing the remaining medium-severity findings.
10
+
11
+ **Performance (hot path):**
12
+ - **User profiles now cached in memory** (`src/services/users.ts`). Previously `touchProfile` — called on every inbound message — did a sync `readFileSync` + `writeFileSync` on disk. Now it updates an in-memory cache and schedules a debounced flush (2s batch window). A final flush runs on graceful shutdown so nothing is lost. Drops 2 blocking fs operations per message.
13
+ - **Embeddings index now cached** (`src/services/embeddings.ts`). Semantic search previously re-read + re-parsed the full on-disk index on every query (100+ MB for large memories). Now cached in memory with mtime-based invalidation — external reindexers are still picked up without a restart.
14
+ - **Skills no longer force-reload every 5 minutes** (`src/services/skills.ts`). `getSkills()` used to re-scan the disk after 5min even though `fs.watch` already triggers hot-reload on change. Cache is now authoritative.
15
+
16
+ **Hardening (unbounded growth):**
17
+ - **Sub-agents map capped at 1000** (`src/services/subagents.ts`). On overflow, evicts the oldest delivered/terminated entries until the map is back down to 90% of the cap. Running agents are never evicted.
18
+ - **Async-agent pending map capped at 500** (`src/services/async-agent-watcher.ts`). Same oldest-first eviction strategy for orphaned `registerPending` entries.
19
+ - **Browser gateway + MCP subprocess stdout/stderr streams now have error handlers** (`browser-manager.ts`, `mcp.ts`). Previously an unhandled stream `error` event could crash the Node process.
20
+
21
+ **Net effect:** message path now does zero blocking fs reads/writes on the profile/skills/embeddings side. Long-running installs can't grow the in-memory state beyond the caps. No API changes.
22
+
23
+ ## [4.17.0] — 2026-04-20
24
+
25
+ ### 🛡️ Hardening: long-running stability audit + leak fixes
26
+
27
+ Ran a full audit of leak/stability hazards for 24/7 operation. Fixed the critical findings and added a disk-cleanup service so the bot stays lean over months of uptime.
28
+
29
+ **Fixes:**
30
+ - **WhatsApp event-listener leak on reconnect** (`src/platforms/whatsapp.ts`): Before every new socket, the previous socket's listeners are now removed and the old socket is ended. Without this, every reconnect stacked new listeners on top of old ones — causing memory growth and duplicate message processing after long sessions.
31
+ - **CDP file-descriptor leak** (`src/services/cdp-bootstrap.ts`): The log-file fd passed to the detached Chromium spawn is now closed in the parent after the child inherits it. Previously leaked one fd per browser bootstrap.
32
+ - **Heartbeat + auto-update timers now `.unref()`'d** and explicitly stopped in the shutdown handler. Prevents timers from keeping the process alive during graceful exit.
33
+
34
+ ### 🧹 Feature: disk-cleanup service
35
+
36
+ New service (`src/services/disk-cleanup.ts`) that runs automatically once a day. Deletes transient files that grow without bound on long-running installs:
37
+ - Bot log rotation (>100 MB by default)
38
+ - Browser screenshots (>30 days)
39
+ - Subagent output streams (>30 days)
40
+ - `/tmp/alvin-bot/` media (>7 days)
41
+ - WhatsApp media cache (>30 days)
42
+ - CDP log file
43
+
44
+ **NEVER touched:** memory, assets, workspaces, cron-jobs, .env, session-store, delivery-queue. Memory is protected.
45
+
46
+ **Configuration via env:** `CLEANUP_LOG_MAX_MB`, `CLEANUP_SCREENSHOTS_DAYS`, `CLEANUP_SUBAGENTS_DAYS`, `CLEANUP_TMP_DAYS`, `CLEANUP_WA_MEDIA_DAYS`. Set any to `0` to disable that category.
47
+
48
+ **Telegram command:**
49
+ - `/cleanup` — show current policy + protected paths
50
+ - `/cleanup run` — trigger manual pass, get stats back
51
+
5
52
  ## [4.16.1] — 2026-04-20
6
53
 
7
54
  ### 🆕 Feature: /update shows release highlights
@@ -28,6 +28,7 @@ import { getWebPort } from "../web/server.js";
28
28
  import { getUsageSummary, getAllRateLimits, formatTokens } from "../services/usage-tracker.js";
29
29
  import { runUpdate, getAutoUpdate, setAutoUpdate, startAutoUpdateLoop } from "../services/updater.js";
30
30
  import { getReleaseHighlights } from "../services/release-highlights.js";
31
+ import { runCleanup, getCleanupPolicy } from "../services/disk-cleanup.js";
31
32
  import { getHealthStatus, isFailedOver } from "../services/heartbeat.js";
32
33
  import { t, LOCALE_NAMES, LOCALE_FLAGS } from "../i18n.js";
33
34
  // Kick off auto-update loop on module load if the persistent flag is set.
@@ -1919,6 +1920,36 @@ export function registerCommands(bot) {
1919
1920
  await ctx.reply(`${t("bot.autoupdate.statusLabel", lang)} *${status ? "ON" : "OFF"}*\n\n${t("bot.autoupdate.commandsLabel", lang)}\n\`/autoupdate on\`\n\`/autoupdate off\``, { parse_mode: "Markdown" });
1920
1921
  }
1921
1922
  });
1923
+ // /cleanup — trigger disk cleanup manually, or show current policy.
1924
+ // /cleanup → show policy
1925
+ // /cleanup run → run a cleanup pass and report what was deleted
1926
+ bot.command("cleanup", async (ctx) => {
1927
+ const arg = (ctx.match || "").trim().toLowerCase();
1928
+ if (arg === "run" || arg === "now") {
1929
+ await ctx.reply("🧹 Running disk cleanup...");
1930
+ const r = await runCleanup();
1931
+ const bytes = r.bytesReclaimed;
1932
+ const human = bytes < 1024 * 1024
1933
+ ? `${(bytes / 1024).toFixed(1)} KB`
1934
+ : bytes < 1024 * 1024 * 1024
1935
+ ? `${(bytes / 1024 / 1024).toFixed(1)} MB`
1936
+ : `${(bytes / 1024 / 1024 / 1024).toFixed(2)} GB`;
1937
+ const errLine = r.errors.length > 0 ? `\n⚠️ ${r.errors.length} error(s)` : "";
1938
+ await ctx.reply(`✅ Cleanup done\n• Files deleted: ${r.filesDeleted}\n• Logs rotated: ${r.logsRotated}\n• Reclaimed: ${human}${errLine}`);
1939
+ }
1940
+ else {
1941
+ const p = getCleanupPolicy();
1942
+ await ctx.reply(`🧹 *Cleanup policy*\n` +
1943
+ `• Log rotation: >${p.logMaxSizeMb} MB\n` +
1944
+ `• Screenshots: >${p.screenshotsMaxAgeDays} days\n` +
1945
+ `• Subagent outputs: >${p.subagentsMaxAgeDays} days\n` +
1946
+ `• /tmp/alvin-bot: >${p.tmpMaxAgeDays} days\n` +
1947
+ `• WhatsApp media: >${p.waMediaMaxAgeDays} days\n\n` +
1948
+ `Memory, assets, workspaces, cron jobs are NEVER touched.\n\n` +
1949
+ `Configure via env: \`CLEANUP_LOG_MAX_MB\`, \`CLEANUP_SCREENSHOTS_DAYS\`, \`CLEANUP_SUBAGENTS_DAYS\`, \`CLEANUP_TMP_DAYS\`, \`CLEANUP_WA_MEDIA_DAYS\`\n\n` +
1950
+ `Run manually: \`/cleanup run\``, { parse_mode: "Markdown" });
1951
+ }
1952
+ });
1922
1953
  // ── /sub-agents — manage background subagents (cron jobs + manual spawns) ──
1923
1954
  //
1924
1955
  // /sub-agents → show current config + running agents
package/dist/index.js CHANGED
@@ -155,7 +155,10 @@ import { startSessionCleanup, stopSessionCleanup, attachPersistHook } from "./se
155
155
  import { loadPersistedSessions, flushSessions, schedulePersist, } from "./services/session-persistence.js";
156
156
  import { processQueue, cleanupQueue, setSenders, enqueue } from "./services/delivery-queue.js";
157
157
  import { discoverTools } from "./services/tool-discovery.js";
158
- import { startHeartbeat } from "./services/heartbeat.js";
158
+ import { startHeartbeat, stopHeartbeat } from "./services/heartbeat.js";
159
+ import { stopAutoUpdateLoop } from "./services/updater.js";
160
+ import { startCleanupLoop, stopCleanupLoop } from "./services/disk-cleanup.js";
161
+ import { flushProfiles } from "./services/users.js";
159
162
  import { initEmbeddings } from "./services/embeddings.js";
160
163
  import { loadSkills } from "./services/skills.js";
161
164
  import { loadHooks } from "./services/hooks.js";
@@ -335,10 +338,19 @@ const shutdown = async () => {
335
338
  stopAsyncAgentWatcher();
336
339
  stopSessionCleanup();
337
340
  stopWorkspaceWatcher();
341
+ stopHeartbeat();
342
+ stopAutoUpdateLoop();
343
+ stopCleanupLoop();
338
344
  // v4.11.0 — Final immediate flush of in-memory sessions to disk before exit.
339
345
  // The debounced timer might be pending; flushSessions() cancels it and writes
340
346
  // synchronously so the next boot can rehydrate the latest state.
341
347
  await flushSessions().catch((err) => console.warn("[shutdown] flushSessions failed:", err));
348
+ try {
349
+ flushProfiles();
350
+ }
351
+ catch (err) {
352
+ console.warn("[shutdown] flushProfiles failed:", err);
353
+ }
342
354
  if (queueInterval)
343
355
  clearInterval(queueInterval);
344
356
  if (queueCleanupInterval)
@@ -612,5 +624,6 @@ else {
612
624
  // Start heartbeat monitor even without Telegram
613
625
  startHeartbeat();
614
626
  startWatchdog();
627
+ startCleanupLoop();
615
628
  initEmbeddings().catch(() => { });
616
629
  }
@@ -252,6 +252,19 @@ export class WhatsAppAdapter {
252
252
  fs.mkdirSync(authDir, { recursive: true });
253
253
  const { state, saveCreds } = await useMultiFileAuthState(authDir);
254
254
  const { version } = await fetchLatestBaileysVersion();
255
+ // Cleanup previous socket (reconnect path) — without this, every reconnect
256
+ // stacks a new set of listeners on baileys' EventEmitter, so messages get
257
+ // processed N times after N reconnects and closures leak.
258
+ if (this.sock) {
259
+ try {
260
+ this.sock.ev?.removeAllListeners?.();
261
+ this.sock.end?.(new Error("reconnect"));
262
+ }
263
+ catch {
264
+ // best-effort cleanup — ignore failures from already-dead socket
265
+ }
266
+ this.sock = null;
267
+ }
255
268
  const sock = makeWASocket({
256
269
  version,
257
270
  auth: {
@@ -62,6 +62,28 @@ function getMissingFileFailureMs() {
62
62
  const pending = new Map();
63
63
  let pollTimer = null;
64
64
  let started = false;
65
+ /**
66
+ * Hard cap on the pending-agents map. Without this, a bot that runs many
67
+ * async agents but sees some fail to write their outputFile would see
68
+ * entries linger up to `giveUpAt` (12h default). If the rate of
69
+ * registerPending() outpaces resolutions for days, memory and the disk
70
+ * state file grow unbounded. We evict oldest-first when over the cap.
71
+ */
72
+ const MAX_PENDING_AGENTS = 500;
73
+ function enforcePendingCap() {
74
+ if (pending.size < MAX_PENDING_AGENTS)
75
+ return;
76
+ const entries = [...pending.entries()].sort((a, b) => a[1].startedAt - b[1].startedAt);
77
+ const target = Math.floor(MAX_PENDING_AGENTS * 0.9);
78
+ let toEvict = pending.size - target;
79
+ for (const [id] of entries) {
80
+ if (toEvict <= 0)
81
+ break;
82
+ pending.delete(id);
83
+ toEvict--;
84
+ }
85
+ console.warn(`[async-agent-watcher] pending map hit cap ${MAX_PENDING_AGENTS}, evicted to ${pending.size}`);
86
+ }
65
87
  // ── Persistence ───────────────────────────────────────────────────
66
88
  function loadFromDisk() {
67
89
  try {
@@ -110,6 +132,7 @@ export function registerPendingAgent(input) {
110
132
  sessionKey: input.sessionKey,
111
133
  platform: input.platform,
112
134
  };
135
+ enforcePendingCap();
113
136
  pending.set(input.agentId, entry);
114
137
  saveToDisk();
115
138
  }
@@ -233,6 +233,17 @@ async function ensureGateway() {
233
233
  gatewayProcess.on("exit", () => {
234
234
  gatewayProcess = null;
235
235
  });
236
+ // Surface spawn failures so we don't silently think the gateway is running.
237
+ gatewayProcess.on("error", (err) => {
238
+ log(`gateway spawn error: ${err.message}`);
239
+ gatewayProcess = null;
240
+ });
241
+ // Drain stdio pipes — otherwise stdout/stderr buffer fills and the child
242
+ // blocks on write. We don't care about the content (just that they drain).
243
+ gatewayProcess.stdout?.on("error", () => { });
244
+ gatewayProcess.stderr?.on("error", () => { });
245
+ gatewayProcess.stdout?.resume();
246
+ gatewayProcess.stderr?.resume();
236
247
  // Wait for startup (max 10s)
237
248
  for (let i = 0; i < 20; i++) {
238
249
  await new Promise((r) => setTimeout(r, 500));
@@ -196,6 +196,12 @@ export async function ensureRunning(opts = {}) {
196
196
  detached: true,
197
197
  });
198
198
  child.unref();
199
+ // The child inherits its own copy of the fd. Close our copy so the parent
200
+ // process doesn't leak a file descriptor per Chromium bootstrap.
201
+ try {
202
+ fs.closeSync(logStream);
203
+ }
204
+ catch { /* already closed — fine */ }
199
205
  if (!child.pid) {
200
206
  throw new Error("Failed to spawn Chromium (no PID)");
201
207
  }
@@ -0,0 +1,203 @@
1
+ /**
2
+ * Disk Cleanup Service — periodic cleanup of transient bot files.
3
+ *
4
+ * Targets files that are SAFE to delete (logs, temp screenshots, browser
5
+ * artifacts, old subagent streams) and leaves critical data alone
6
+ * (memory, assets, workspaces, cron-jobs, .env, session-store).
7
+ *
8
+ * Strategy:
9
+ * - Each path has a max age (days) OR a max size (MB, with rotation)
10
+ * - Defaults are conservative: keep 30 days of artifacts, rotate logs >100MB
11
+ * - All knobs overridable via env (CLEANUP_* vars) and via /cleanup set <key>
12
+ * - Runs once at boot + every 24h thereafter, unref'd so it doesn't
13
+ * prevent shutdown
14
+ *
15
+ * NEVER cleaned:
16
+ * ~/.alvin-bot/memory/ (daily logs, long-term memory)
17
+ * ~/.alvin-bot/assets/ (user-supplied files)
18
+ * ~/.alvin-bot/workspaces/ (user configuration)
19
+ * ~/.alvin-bot/cron-jobs.json (scheduled tasks)
20
+ * ~/.alvin-bot/.env (secrets)
21
+ * ~/.alvin-bot/session-store.json (resume tokens)
22
+ * ~/.alvin-bot/delivery-queue.json
23
+ * ~/.alvin-bot/standing-orders
24
+ * ~/.alvin-bot/auto-update.flag
25
+ */
26
+ import fs from "fs";
27
+ import path from "path";
28
+ import os from "os";
29
+ import { DATA_DIR } from "../paths.js";
30
+ const DEFAULT_POLICY = {
31
+ logMaxSizeMb: parseInt(process.env.CLEANUP_LOG_MAX_MB || "100", 10),
32
+ screenshotsMaxAgeDays: parseInt(process.env.CLEANUP_SCREENSHOTS_DAYS || "30", 10),
33
+ subagentsMaxAgeDays: parseInt(process.env.CLEANUP_SUBAGENTS_DAYS || "30", 10),
34
+ tmpMaxAgeDays: parseInt(process.env.CLEANUP_TMP_DAYS || "7", 10),
35
+ waMediaMaxAgeDays: parseInt(process.env.CLEANUP_WA_MEDIA_DAYS || "30", 10),
36
+ };
37
+ const CLEANUP_INTERVAL_MS = 24 * 60 * 60 * 1000; // once a day
38
+ let cleanupTimer = null;
39
+ /**
40
+ * Return the current effective policy (env-overridden defaults).
41
+ */
42
+ export function getCleanupPolicy() {
43
+ return { ...DEFAULT_POLICY };
44
+ }
45
+ /**
46
+ * Run a cleanup pass once. Safe to call manually (e.g. /cleanup command).
47
+ */
48
+ export async function runCleanup(policyOverride) {
49
+ const policy = { ...DEFAULT_POLICY, ...policyOverride };
50
+ const result = {
51
+ filesDeleted: 0,
52
+ bytesReclaimed: 0,
53
+ logsRotated: 0,
54
+ errors: [],
55
+ details: [],
56
+ };
57
+ // 1. Rotate large log files (launchd stdout/stderr)
58
+ if (policy.logMaxSizeMb > 0) {
59
+ const logsDir = path.join(DATA_DIR, "logs");
60
+ try {
61
+ if (fs.existsSync(logsDir)) {
62
+ for (const name of fs.readdirSync(logsDir)) {
63
+ if (!name.endsWith(".log"))
64
+ continue;
65
+ const full = path.join(logsDir, name);
66
+ try {
67
+ const st = fs.statSync(full);
68
+ if (st.size > policy.logMaxSizeMb * 1024 * 1024) {
69
+ // Rotate: keep a .old, overwrite current. Launchd will reopen on next write.
70
+ const oldPath = full + ".old";
71
+ try {
72
+ fs.rmSync(oldPath, { force: true });
73
+ }
74
+ catch { }
75
+ fs.renameSync(full, oldPath);
76
+ fs.writeFileSync(full, "");
77
+ result.logsRotated++;
78
+ result.bytesReclaimed += st.size;
79
+ result.details.push({ path: full, action: "rotated", size: st.size });
80
+ }
81
+ }
82
+ catch (err) {
83
+ result.errors.push(`log-rotate ${full}: ${err.message}`);
84
+ }
85
+ }
86
+ }
87
+ }
88
+ catch (err) {
89
+ result.errors.push(`logs scan: ${err.message}`);
90
+ }
91
+ }
92
+ // 2. Browser screenshots (bot-owned CDP)
93
+ if (policy.screenshotsMaxAgeDays > 0) {
94
+ const dir = path.join(DATA_DIR, "browser", "screenshots");
95
+ cleanupOldFiles(dir, policy.screenshotsMaxAgeDays, result);
96
+ }
97
+ // 3. Subagent streaming outputs — only delete FINISHED ones (older than N days).
98
+ // We trust that the async-agent-watcher has already marked them done — files
99
+ // older than a few days are either delivered or definitively abandoned.
100
+ if (policy.subagentsMaxAgeDays > 0) {
101
+ const dir = path.join(DATA_DIR, "subagents");
102
+ cleanupOldFiles(dir, policy.subagentsMaxAgeDays, result, [".jsonl", ".err"]);
103
+ }
104
+ // 4. /tmp/alvin-bot/* (media, temp scrapes)
105
+ if (policy.tmpMaxAgeDays > 0) {
106
+ cleanupOldFiles("/tmp/alvin-bot", policy.tmpMaxAgeDays, result);
107
+ }
108
+ // 5. WhatsApp media cache
109
+ if (policy.waMediaMaxAgeDays > 0) {
110
+ const dir = path.join(DATA_DIR, "data", "wa-media");
111
+ cleanupOldFiles(dir, policy.waMediaMaxAgeDays, result);
112
+ }
113
+ // 6. CDP log (/tmp/chrome-cdp.log) — always keep just the latest boot
114
+ const cdpLog = path.join(os.tmpdir(), "chrome-cdp.log");
115
+ try {
116
+ if (fs.existsSync(cdpLog)) {
117
+ const st = fs.statSync(cdpLog);
118
+ const ageDays = (Date.now() - st.mtimeMs) / (24 * 60 * 60 * 1000);
119
+ if (ageDays > 7) {
120
+ fs.unlinkSync(cdpLog);
121
+ result.filesDeleted++;
122
+ result.bytesReclaimed += st.size;
123
+ result.details.push({ path: cdpLog, action: "deleted", size: st.size });
124
+ }
125
+ }
126
+ }
127
+ catch {
128
+ // Not critical
129
+ }
130
+ return result;
131
+ }
132
+ /**
133
+ * Delete files in `dir` older than `maxAgeDays`. Safe if `dir` doesn't exist.
134
+ * Optional extension filter — e.g. [".jsonl", ".err"] restricts to those types.
135
+ */
136
+ function cleanupOldFiles(dir, maxAgeDays, result, extensions) {
137
+ if (!fs.existsSync(dir))
138
+ return;
139
+ const cutoffMs = Date.now() - maxAgeDays * 24 * 60 * 60 * 1000;
140
+ try {
141
+ for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
142
+ const full = path.join(dir, entry.name);
143
+ if (!entry.isFile())
144
+ continue;
145
+ if (extensions && !extensions.some((ext) => entry.name.endsWith(ext)))
146
+ continue;
147
+ try {
148
+ const st = fs.statSync(full);
149
+ if (st.mtimeMs < cutoffMs) {
150
+ fs.unlinkSync(full);
151
+ result.filesDeleted++;
152
+ result.bytesReclaimed += st.size;
153
+ result.details.push({ path: full, action: "deleted", size: st.size });
154
+ }
155
+ }
156
+ catch (err) {
157
+ result.errors.push(`${full}: ${err.message}`);
158
+ }
159
+ }
160
+ }
161
+ catch (err) {
162
+ result.errors.push(`scan ${dir}: ${err.message}`);
163
+ }
164
+ }
165
+ /**
166
+ * Start the periodic cleanup loop. Runs first pass after 5 minutes (let the
167
+ * bot fully boot and avoid competing with startup I/O), then every 24h.
168
+ */
169
+ export function startCleanupLoop() {
170
+ if (cleanupTimer)
171
+ return;
172
+ // First run delayed so we don't step on a restart that's still writing logs
173
+ setTimeout(() => {
174
+ void runCleanup().then((r) => {
175
+ if (r.filesDeleted > 0 || r.logsRotated > 0) {
176
+ console.log(`[cleanup] ${r.filesDeleted} files deleted, ${r.logsRotated} logs rotated, ${formatBytes(r.bytesReclaimed)} reclaimed`);
177
+ }
178
+ });
179
+ }, 5 * 60 * 1000);
180
+ cleanupTimer = setInterval(() => {
181
+ void runCleanup().then((r) => {
182
+ if (r.filesDeleted > 0 || r.logsRotated > 0) {
183
+ console.log(`[cleanup] ${r.filesDeleted} files deleted, ${r.logsRotated} logs rotated, ${formatBytes(r.bytesReclaimed)} reclaimed`);
184
+ }
185
+ });
186
+ }, CLEANUP_INTERVAL_MS);
187
+ cleanupTimer.unref?.();
188
+ }
189
+ export function stopCleanupLoop() {
190
+ if (cleanupTimer) {
191
+ clearInterval(cleanupTimer);
192
+ cleanupTimer = null;
193
+ }
194
+ }
195
+ function formatBytes(n) {
196
+ if (n < 1024)
197
+ return `${n} B`;
198
+ if (n < 1024 * 1024)
199
+ return `${(n / 1024).toFixed(1)} KB`;
200
+ if (n < 1024 * 1024 * 1024)
201
+ return `${(n / 1024 / 1024).toFixed(1)} MB`;
202
+ return `${(n / 1024 / 1024 / 1024).toFixed(2)} GB`;
203
+ }
@@ -143,12 +143,26 @@ function chunkMarkdown(content, source) {
143
143
  return chunks;
144
144
  }
145
145
  // ── Index Management ────────────────────────────────────
146
+ // In-memory cache for the embedding index. Without this, every query would
147
+ // re-read and re-parse the on-disk index (can be 100+ MB, making searchMemory
148
+ // the slowest step in a message turn). We keep the parsed object and invalidate
149
+ // via mtime check — so external reindexers are still picked up.
150
+ let indexCache = null;
151
+ let indexCacheMtime = 0;
146
152
  function loadIndex() {
147
153
  try {
154
+ const st = fs.statSync(INDEX_FILE);
155
+ if (indexCache && st.mtimeMs === indexCacheMtime) {
156
+ return indexCache;
157
+ }
148
158
  const raw = fs.readFileSync(INDEX_FILE, "utf-8");
149
- return JSON.parse(raw);
159
+ indexCache = JSON.parse(raw);
160
+ indexCacheMtime = st.mtimeMs;
161
+ return indexCache;
150
162
  }
151
163
  catch {
164
+ // File missing or unparseable — return an empty index and don't cache it
165
+ // (next call will retry, so a freshly-written index gets picked up).
152
166
  return {
153
167
  model: EMBEDDING_MODEL,
154
168
  lastReindex: 0,
@@ -159,6 +173,15 @@ function loadIndex() {
159
173
  }
160
174
  function saveIndex(index) {
161
175
  fs.writeFileSync(INDEX_FILE, JSON.stringify(index));
176
+ // Refresh cache immediately so the next loadIndex() sees the new state
177
+ // without a disk round-trip.
178
+ indexCache = index;
179
+ try {
180
+ indexCacheMtime = fs.statSync(INDEX_FILE).mtimeMs;
181
+ }
182
+ catch {
183
+ indexCacheMtime = Date.now();
184
+ }
162
185
  }
163
186
  /**
164
187
  * Recursively walk a directory, returning file paths.
@@ -72,6 +72,10 @@ export function startHeartbeat() {
72
72
  setTimeout(() => {
73
73
  runHeartbeat();
74
74
  state.intervalId = setInterval(runHeartbeat, HEARTBEAT_INTERVAL_MS);
75
+ // .unref() so this interval alone doesn't keep the process alive during
76
+ // graceful shutdown — the bot's main loop (grammy, platforms) keeps it
77
+ // running, and once those stop we want the process to exit cleanly.
78
+ state.intervalId?.unref?.();
75
79
  }, 30_000);
76
80
  }
77
81
  /**
@@ -116,6 +116,17 @@ async function connectStdio(name, config) {
116
116
  proc.stderr.on("data", (data) => {
117
117
  console.error(`MCP ${name} stderr:`, data.toString().trim());
118
118
  });
119
+ // Surface stderr stream errors so we don't silently lose the channel
120
+ // (EPIPE, ECONNRESET etc). Without this, unhandled 'error' on the
121
+ // stream would crash the whole Node process.
122
+ proc.stderr.on("error", (err) => {
123
+ console.error(`MCP ${name} stderr stream error:`, err.message);
124
+ server.connected = false;
125
+ });
126
+ proc.stdout?.on("error", (err) => {
127
+ console.error(`MCP ${name} stdout stream error:`, err.message);
128
+ server.connected = false;
129
+ });
119
130
  proc.on("error", (err) => {
120
131
  console.error(`MCP ${name} process error:`, err);
121
132
  server.connected = false;
@@ -167,10 +167,12 @@ export function loadSkills() {
167
167
  return cachedSkills;
168
168
  }
169
169
  /**
170
- * Get all loaded skills.
170
+ * Get all loaded skills. Cached after the first loadSkills() call; hot-reload
171
+ * happens via fs.watch when files change on disk. We only force a scan here if
172
+ * the cache is empty (init-order edge case).
171
173
  */
172
174
  export function getSkills() {
173
- if (cachedSkills.length === 0 || Date.now() - lastScanAt > 300_000) {
175
+ if (cachedSkills.length === 0) {
174
176
  reloadAllSkills();
175
177
  }
176
178
  return cachedSkills;
@@ -128,6 +128,43 @@ export function setDefaultTimeoutMs(ms) {
128
128
  }
129
129
  // ── State ───────────────────────────────────────────────
130
130
  const activeAgents = new Map();
131
+ /**
132
+ * Hard cap on the activeAgents map. Without this, a long-running bot that
133
+ * spawns many agents (e.g. a chatty cron + manual triggers over months) would
134
+ * accumulate delivered entries indefinitely. The 30-min auto-cleanup inside
135
+ * runSubAgent only fires on graceful completion, so crashed/orphaned entries
136
+ * would linger until the 12h giveUpAt ceiling.
137
+ *
138
+ * Enforcement: whenever we insert a new entry and the map is at-or-over the
139
+ * cap, evict the oldest finished-and-delivered entries first. Running agents
140
+ * are never evicted.
141
+ */
142
+ const MAX_ACTIVE_AGENTS = 1000;
143
+ function enforceAgentCap() {
144
+ if (activeAgents.size < MAX_ACTIVE_AGENTS)
145
+ return;
146
+ // Collect evictable entries (delivered OR terminal status), sort by startedAt
147
+ const evictable = [];
148
+ for (const [id, entry] of activeAgents) {
149
+ const status = entry.info.status;
150
+ const done = entry.delivered || status === "error" || status === "timeout" || status === "cancelled";
151
+ if (done)
152
+ evictable.push([id, entry.info.startedAt]);
153
+ }
154
+ evictable.sort((a, b) => a[1] - b[1]);
155
+ // Evict enough to land 10% below the cap, so we don't oscillate.
156
+ const target = Math.floor(MAX_ACTIVE_AGENTS * 0.9);
157
+ let toEvict = activeAgents.size - target;
158
+ for (const [id] of evictable) {
159
+ if (toEvict <= 0)
160
+ break;
161
+ activeAgents.delete(id);
162
+ toEvict--;
163
+ }
164
+ if (toEvict > 0) {
165
+ console.warn(`[subagents] map at ${activeAgents.size}/${MAX_ACTIVE_AGENTS} — could not evict enough finished entries (too many still running)`);
166
+ }
167
+ }
131
168
  // ── Name resolver (B2) ──────────────────────────────────
132
169
  /**
133
170
  * Return all currently-tracked agents whose *base* name matches `base`.
@@ -563,6 +600,7 @@ export function spawnSubAgent(agentConfig) {
563
600
  nameIndex: resolved.index,
564
601
  queuePosition: willRunImmediately ? undefined : queuedLen + 1,
565
602
  };
603
+ enforceAgentCap();
566
604
  activeAgents.set(id, { info, abort, delivered: false });
567
605
  const queuedSpawn = { id, resolvedName, agentConfig, depth, timeoutId };
568
606
  if (willRunImmediately) {
@@ -272,6 +272,7 @@ export function startAutoUpdateLoop() {
272
272
  console.log(`[auto-update] check failed: ${result.message}`);
273
273
  }
274
274
  }, AUTO_CHECK_INTERVAL_MS);
275
+ autoTimer.unref?.();
275
276
  console.log(`[auto-update] loop started (interval: 6h)`);
276
277
  }
277
278
  export function stopAutoUpdateLoop() {
@@ -8,6 +8,12 @@
8
8
  *
9
9
  * The admin/owner user uses the global docs/memory/ and docs/MEMORY.md.
10
10
  * Additional users get isolated memory spaces.
11
+ *
12
+ * Performance:
13
+ * Profiles are cached in memory after first read. `touchProfile` — called
14
+ * on every inbound message — writes to cache and schedules a debounced
15
+ * disk flush (2s). This avoids two sync fs operations per message on the
16
+ * hot path. A final flush happens on graceful shutdown so nothing is lost.
11
17
  */
12
18
  import fs from "fs";
13
19
  import { resolve } from "path";
@@ -18,6 +24,42 @@ import { USERS_DIR, MEMORY_DIR } from "../paths.js";
18
24
  // Ensure users dir exists
19
25
  if (!fs.existsSync(USERS_DIR))
20
26
  fs.mkdirSync(USERS_DIR, { recursive: true });
27
+ // ── In-memory cache + debounced persistence ─────────────
28
+ const cache = new Map();
29
+ const dirty = new Set();
30
+ let flushTimer = null;
31
+ const FLUSH_DELAY_MS = 2000;
32
+ function schedule_flush() {
33
+ if (flushTimer)
34
+ return;
35
+ flushTimer = setTimeout(() => {
36
+ flushTimer = null;
37
+ flushProfiles();
38
+ }, FLUSH_DELAY_MS);
39
+ flushTimer.unref?.();
40
+ }
41
+ /**
42
+ * Write every dirty profile to disk synchronously. Called by the debounce
43
+ * timer AND by the graceful-shutdown handler so no in-flight updates are
44
+ * lost even if the bot exits between debounce ticks.
45
+ */
46
+ export function flushProfiles() {
47
+ if (dirty.size === 0)
48
+ return;
49
+ for (const userId of dirty) {
50
+ const profile = cache.get(userId);
51
+ if (!profile)
52
+ continue;
53
+ try {
54
+ fs.writeFileSync(profilePath(userId), JSON.stringify(profile, null, 2));
55
+ }
56
+ catch (err) {
57
+ // Don't throw — a persistent error would block future flushes.
58
+ console.warn(`[users] flush ${userId} failed: ${err.message}`);
59
+ }
60
+ }
61
+ dirty.clear();
62
+ }
21
63
  // ── Profile Management ──────────────────────────────────
22
64
  function profilePath(userId) {
23
65
  return resolve(USERS_DIR, `${userId}.json`);
@@ -26,22 +68,32 @@ function userMemoryDir(userId) {
26
68
  return resolve(USERS_DIR, `${userId}`);
27
69
  }
28
70
  /**
29
- * Load a user profile. Returns null if not found.
71
+ * Load a user profile. Returns null if not found. Reads from cache first,
72
+ * falls back to disk on cache miss.
30
73
  */
31
74
  export function loadProfile(userId) {
75
+ const cached = cache.get(userId);
76
+ if (cached)
77
+ return cached;
32
78
  try {
33
79
  const raw = fs.readFileSync(profilePath(userId), "utf-8");
34
- return JSON.parse(raw);
80
+ const profile = JSON.parse(raw);
81
+ cache.set(userId, profile);
82
+ return profile;
35
83
  }
36
84
  catch {
37
85
  return null;
38
86
  }
39
87
  }
40
88
  /**
41
- * Save a user profile.
89
+ * Save a user profile — updates cache and schedules a debounced disk flush.
90
+ * For immediate durability (e.g. during shutdown), call flushProfiles()
91
+ * after this.
42
92
  */
43
93
  export function saveProfile(profile) {
44
- fs.writeFileSync(profilePath(profile.userId), JSON.stringify(profile, null, 2));
94
+ cache.set(profile.userId, profile);
95
+ dirty.add(profile.userId);
96
+ schedule_flush();
45
97
  }
46
98
  /**
47
99
  * Get or create a user profile.
@@ -76,6 +128,9 @@ export function getOrCreateProfile(userId, name, username) {
76
128
  }
77
129
  /**
78
130
  * Update a user's activity (call on each message).
131
+ *
132
+ * Previously this did a sync read + write per message. Now it works purely
133
+ * in memory and lets the debounce timer batch writes to disk.
79
134
  */
80
135
  export function touchProfile(userId, name, username, platform, messageText) {
81
136
  const profile = getOrCreateProfile(userId, name, username);
@@ -95,20 +150,33 @@ export function touchProfile(userId, name, username, platform, messageText) {
95
150
  return profile;
96
151
  }
97
152
  /**
98
- * List all known user profiles.
153
+ * List all known user profiles. Reads from disk; populates cache for
154
+ * subsequent fast access.
99
155
  */
100
156
  export function listProfiles() {
101
157
  const profiles = [];
102
158
  try {
103
159
  const files = fs.readdirSync(USERS_DIR);
104
160
  for (const file of files) {
105
- if (file.endsWith(".json")) {
106
- try {
107
- const raw = fs.readFileSync(resolve(USERS_DIR, file), "utf-8");
108
- profiles.push(JSON.parse(raw));
109
- }
110
- catch { /* skip corrupt */ }
161
+ if (!file.endsWith(".json"))
162
+ continue;
163
+ // Parse user id from filename — skip non-numeric (e.g. stray files)
164
+ const userId = parseInt(file.slice(0, -5), 10);
165
+ if (!Number.isFinite(userId))
166
+ continue;
167
+ // If cached, use that; otherwise read once and cache
168
+ const cached = cache.get(userId);
169
+ if (cached) {
170
+ profiles.push(cached);
171
+ continue;
172
+ }
173
+ try {
174
+ const raw = fs.readFileSync(resolve(USERS_DIR, file), "utf-8");
175
+ const p = JSON.parse(raw);
176
+ cache.set(userId, p);
177
+ profiles.push(p);
111
178
  }
179
+ catch { /* skip corrupt */ }
112
180
  }
113
181
  }
114
182
  catch { /* dir doesn't exist */ }
@@ -145,6 +213,9 @@ export function addUserNote(userId, note) {
145
213
  export function deleteUser(userId) {
146
214
  const deleted = [];
147
215
  const errors = [];
216
+ // 0. Drop from cache + dirty set so the debounce doesn't re-create the file
217
+ cache.delete(userId);
218
+ dirty.delete(userId);
148
219
  // 1. Delete profile JSON
149
220
  const pPath = profilePath(userId);
150
221
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alvin-bot",
3
- "version": "4.16.1",
3
+ "version": "4.18.0",
4
4
  "description": "Alvin Bot \u2014 Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",