@virtengine/openfleet 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. package/.env.example +914 -0
  2. package/LICENSE +190 -0
  3. package/README.md +500 -0
  4. package/agent-endpoint.mjs +918 -0
  5. package/agent-hook-bridge.mjs +230 -0
  6. package/agent-hooks.mjs +1188 -0
  7. package/agent-pool.mjs +2403 -0
  8. package/agent-prompts.mjs +689 -0
  9. package/agent-sdk.mjs +141 -0
  10. package/anomaly-detector.mjs +1195 -0
  11. package/autofix.mjs +1294 -0
  12. package/claude-shell.mjs +708 -0
  13. package/cli.mjs +906 -0
  14. package/codex-config.mjs +1274 -0
  15. package/codex-model-profiles.mjs +135 -0
  16. package/codex-shell.mjs +762 -0
  17. package/config-doctor.mjs +613 -0
  18. package/config.mjs +1720 -0
  19. package/conflict-resolver.mjs +248 -0
  20. package/container-runner.mjs +450 -0
  21. package/copilot-shell.mjs +827 -0
  22. package/daemon-restart-policy.mjs +56 -0
  23. package/diff-stats.mjs +282 -0
  24. package/error-detector.mjs +829 -0
  25. package/fetch-runtime.mjs +34 -0
  26. package/fleet-coordinator.mjs +838 -0
  27. package/get-telegram-chat-id.mjs +71 -0
  28. package/git-safety.mjs +170 -0
  29. package/github-reconciler.mjs +403 -0
  30. package/hook-profiles.mjs +651 -0
  31. package/kanban-adapter.mjs +4491 -0
  32. package/lib/logger.mjs +645 -0
  33. package/maintenance.mjs +828 -0
  34. package/merge-strategy.mjs +1171 -0
  35. package/monitor.mjs +12207 -0
  36. package/openfleet.config.example.json +115 -0
  37. package/openfleet.schema.json +465 -0
  38. package/package.json +203 -0
  39. package/postinstall.mjs +187 -0
  40. package/pr-cleanup-daemon.mjs +978 -0
  41. package/preflight.mjs +408 -0
  42. package/prepublish-check.mjs +90 -0
  43. package/presence.mjs +328 -0
  44. package/primary-agent.mjs +282 -0
  45. package/publish.mjs +151 -0
  46. package/repo-root.mjs +29 -0
  47. package/restart-controller.mjs +100 -0
  48. package/review-agent.mjs +557 -0
  49. package/rotate-agent-logs.sh +133 -0
  50. package/sdk-conflict-resolver.mjs +973 -0
  51. package/session-tracker.mjs +880 -0
  52. package/setup.mjs +3937 -0
  53. package/shared-knowledge.mjs +410 -0
  54. package/shared-state-manager.mjs +841 -0
  55. package/shared-workspace-cli.mjs +199 -0
  56. package/shared-workspace-registry.mjs +537 -0
  57. package/shared-workspaces.json +18 -0
  58. package/startup-service.mjs +1070 -0
  59. package/sync-engine.mjs +1063 -0
  60. package/task-archiver.mjs +801 -0
  61. package/task-assessment.mjs +550 -0
  62. package/task-claims.mjs +924 -0
  63. package/task-complexity.mjs +581 -0
  64. package/task-executor.mjs +5111 -0
  65. package/task-store.mjs +753 -0
  66. package/telegram-bot.mjs +9281 -0
  67. package/telegram-sentinel.mjs +2010 -0
  68. package/ui/app.js +867 -0
  69. package/ui/app.legacy.js +1464 -0
  70. package/ui/app.monolith.js +2488 -0
  71. package/ui/components/charts.js +226 -0
  72. package/ui/components/chat-view.js +567 -0
  73. package/ui/components/command-palette.js +587 -0
  74. package/ui/components/diff-viewer.js +190 -0
  75. package/ui/components/forms.js +327 -0
  76. package/ui/components/kanban-board.js +451 -0
  77. package/ui/components/session-list.js +305 -0
  78. package/ui/components/shared.js +473 -0
  79. package/ui/index.html +70 -0
  80. package/ui/modules/api.js +297 -0
  81. package/ui/modules/icons.js +461 -0
  82. package/ui/modules/router.js +81 -0
  83. package/ui/modules/settings-schema.js +261 -0
  84. package/ui/modules/state.js +679 -0
  85. package/ui/modules/telegram.js +331 -0
  86. package/ui/modules/utils.js +270 -0
  87. package/ui/styles/animations.css +140 -0
  88. package/ui/styles/base.css +98 -0
  89. package/ui/styles/components.css +1915 -0
  90. package/ui/styles/kanban.css +286 -0
  91. package/ui/styles/layout.css +809 -0
  92. package/ui/styles/sessions.css +827 -0
  93. package/ui/styles/variables.css +188 -0
  94. package/ui/styles.css +141 -0
  95. package/ui/styles.monolith.css +1046 -0
  96. package/ui/tabs/agents.js +1417 -0
  97. package/ui/tabs/chat.js +74 -0
  98. package/ui/tabs/control.js +887 -0
  99. package/ui/tabs/dashboard.js +515 -0
  100. package/ui/tabs/infra.js +537 -0
  101. package/ui/tabs/logs.js +783 -0
  102. package/ui/tabs/settings.js +1487 -0
  103. package/ui/tabs/tasks.js +1385 -0
  104. package/ui-server.mjs +4073 -0
  105. package/update-check.mjs +465 -0
  106. package/utils.mjs +172 -0
  107. package/ve-kanban.mjs +654 -0
  108. package/ve-kanban.ps1 +1365 -0
  109. package/ve-kanban.sh +18 -0
  110. package/ve-orchestrator.mjs +340 -0
  111. package/ve-orchestrator.ps1 +6546 -0
  112. package/ve-orchestrator.sh +18 -0
  113. package/vibe-kanban-wrapper.mjs +41 -0
  114. package/vk-error-resolver.mjs +470 -0
  115. package/vk-log-stream.mjs +914 -0
  116. package/whatsapp-channel.mjs +520 -0
  117. package/workspace-monitor.mjs +581 -0
  118. package/workspace-reaper.mjs +405 -0
  119. package/workspace-registry.mjs +238 -0
  120. package/worktree-manager.mjs +1266 -0
@@ -0,0 +1,2010 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * telegram-sentinel.mjs — Always-on Telegram command listener for openfleet.
5
+ *
6
+ * Runs independently of the main openfleet process, ensuring Telegram
7
+ * commands are always handled even when openfleet is down.
8
+ *
9
+ * Architecture:
10
+ * ┌─────────────────┐
11
+ * │ telegram-sentinel│──── always running ────────────────────────────────┐
12
+ * │ (this file) │ │
13
+ * └────────┬─────────┘ │
14
+ * │ │
15
+ * ├─ Standalone Mode (openfleet DOWN) │
16
+ * │ ├─ Polls Telegram directly │
17
+ * │ ├─ Handles simple commands (/ping, /status, /sentinel) │
18
+ * │ └─ Auto-starts openfleet for complex commands │
19
+ * │ │
20
+ * └─ Companion Mode (openfleet UP) │
21
+ * ├─ Does NOT poll (lets telegram-bot.mjs handle it) │
22
+ * ├─ Monitors openfleet health via PID file │
23
+ * └─ Transitions to Standalone if openfleet dies │
24
+ * │
25
+ * ┌─────────────────┐ │
26
+ * │ openfleet │ ← started/stopped by sentinel as needed ─────────┘
27
+ * │ (cli.mjs fork) │
28
+ * └─────────────────┘
29
+ *
30
+ * Usage:
31
+ * node telegram-sentinel.mjs # start sentinel
32
+ * node telegram-sentinel.mjs --stop # stop sentinel
33
+ * node telegram-sentinel.mjs --status # check sentinel status
34
+ */
35
+
36
+ import {
37
+ existsSync,
38
+ readFileSync,
39
+ mkdirSync,
40
+ unlinkSync,
41
+ writeFileSync,
42
+ } from "node:fs";
43
+ import { readFile, writeFile, unlink } from "node:fs/promises";
44
+ import { resolve, dirname } from "node:path";
45
+ import { fileURLToPath } from "node:url";
46
+ import { spawn } from "node:child_process";
47
+ import os from "node:os";
48
+ import {
49
+ execPrimaryPrompt,
50
+ getPrimaryAgentInfo,
51
+ initPrimaryAgent,
52
+ } from "./primary-agent.mjs";
53
+ import { resolveRepoRoot } from "./repo-root.mjs";
54
+
55
+ // ── Paths ────────────────────────────────────────────────────────────────────
56
+
57
+ const __filename = fileURLToPath(import.meta.url); // absolute path of this module file
58
+ const __dirname = dirname(__filename); // directory that contains this module
59
+ const repoRoot = resolveRepoRoot();
60
+ const cacheDir = resolve(repoRoot, ".cache"); // `.cache` directory under the resolved repo root
61
+
62
+ const MONITOR_PID_FILE = resolve(__dirname, ".cache", "openfleet.pid"); // NOTE(review): resolved under this module's directory, not under cacheDir like the files below — confirm intentional
63
+ const SENTINEL_PID_FILE = resolve(cacheDir, "telegram-sentinel.pid");
64
+ const SENTINEL_HEARTBEAT_FILE = resolve(cacheDir, "sentinel-heartbeat.json");
65
+ const SENTINEL_LOCK_FILE = resolve(cacheDir, "telegram-sentinel.lock"); // exclusive-create lock used by acquireSentinelPollLock
66
+ const SENTINEL_COMMAND_QUEUE_FILE = resolve(
67
+ cacheDir,
68
+ "sentinel-command-queue.json",
69
+ );
70
+ const SENTINEL_MONITOR_RECOVERY_FILE = resolve(
71
+ cacheDir,
72
+ "sentinel-monitor-recovery.json",
73
+ );
74
+ const MONITOR_POLL_LOCK_FILE = resolve(cacheDir, "telegram-getupdates.lock");
75
+ const STATUS_FILE = resolve(cacheDir, "ve-orchestrator-status.json"); // JSON status file read by assessMonitorMonitorHealth
76
+
77
+ const TAG = "[sentinel]";
78
+ const POLL_TIMEOUT_S = 30; // `timeout` query parameter sent with getUpdates (seconds)
79
+ const MAX_MESSAGE_LEN = 4000; // chunk size passed to splitMessage when sending
80
+ const HEALTH_CHECK_INTERVAL_MS = 30_000;
81
+ const POLL_ERROR_BACKOFF_BASE_MS = 5_000; // first retry delay; doubled per consecutive poll error
82
+ const POLL_ERROR_BACKOFF_MAX_MS = 120_000; // upper bound for the poll retry delay
83
+ const COMMAND_QUEUE_MAX_SIZE = 50;
84
+ const COMMAND_QUEUE_TTL_MS = 10 * 60 * 1000; // 10 minutes
85
+ const MONITOR_START_TIMEOUT_MS = 60_000; // 60s to wait for monitor to become healthy
86
+ const MONITOR_HEALTH_POLL_MS = 2_000; // check every 2s during startup
87
+
88
+ // ── State ────────────────────────────────────────────────────────────────────
89
+
90
+ /** @type {"standalone" | "companion"} */
91
+ let mode = "standalone"; // pollLoop only runs while this is "standalone"
92
+ let running = false; // pollLoop exits when this is false
93
+ let polling = false; // pollLoop exits when this is false
94
+ /** @type {AbortController | null} */
95
+ let pollAbort = null; // controller of the in-flight getUpdates request, null when idle
96
+ let lastUpdateId = 0; // highest update_id seen; the next poll asks for lastUpdateId + 1
97
+ let healthCheckTimer = null;
98
+ let heartbeatTimer = null;
99
+ let consecutivePollErrors = 0; // drives the exponential poll backoff; reset after a successful poll
100
+ let commandsProcessed = 0;
101
+ let startedAt = new Date().toISOString();
102
+ /** @type {Array<{ chatId: string|number, text: string, timestamp: number }>} */
103
+ let commandQueue = [];
104
+ /** @type {Promise<void> | null} */
105
+ let monitorStartPromise = null; // attemptMonitorRecovery returns early while this is set
106
+ let sentinelPollLockHeld = false; // true while this process owns SENTINEL_LOCK_FILE
107
+ let recoveryInProgress = false; // set for the duration of runRepairAgent
108
+ let monitorRestartAttempts = []; // epoch-ms timestamps, pruned to the crash-loop window
109
+ let monitorCrashEvents = []; // epoch-ms timestamps, pruned to the crash-loop window
110
+ let lastRepairAt = 0; // epoch ms of the last repair-agent launch (0 = never)
111
+ let lastMonitorStartAt = 0;
112
+ let monitorManualStopUntil = 0; // auto-restart is suppressed while Date.now() is below this
113
+
114
+ const sentinelConfig = { // defaults; initEnv() overwrites every field from SENTINEL_* settings
115
+ autoRestartMonitor: true,
116
+ crashLoopThreshold: 3,
117
+ crashLoopWindowMs: 10 * 60 * 1000, // 10 minutes
118
+ monitorStartGraceMs: 45 * 1000, // 45 seconds
119
+ repairAgentEnabled: true,
120
+ repairCooldownMs: 15 * 60 * 1000, // 15 minutes
121
+ repairTimeoutMs: 20 * 60 * 1000, // 20 minutes
122
+ primaryAgentFallbackEnabled: true,
123
+ primaryAgentFallbackTimeoutMs: 15 * 60 * 1000, // 15 minutes
124
+ restartBackoffMs: 5 * 1000, // 5 seconds
125
+ manualStopHoldMs: 10 * 60 * 1000, // 10 minutes
126
+ monitorMonitorCheckEnabled: true,
127
+ monitorMonitorMaxAgeMs: 20 * 60 * 1000, // 20 minutes
128
+ };
129
+
130
+ // ── Environment ──────────────────────────────────────────────────────────────
131
+
132
+ /** @type {string} */
133
+ let telegramToken = ""; // set by initEnv from TELEGRAM_BOT_TOKEN; send/poll are no-ops while empty
134
+ /** @type {string} */
135
+ let telegramChatId = ""; // set by initEnv from TELEGRAM_CHAT_ID
136
+ /** @type {string} */
137
+ let projectName = ""; // set by initEnv from PROJECT_NAME (falls back to "openfleet")
138
+
139
/**
 * Read the `.env` file next to this module into a flat key/value map.
 *
 * Blank lines, lines starting with `#`, and lines without `=` are skipped.
 * Keys and values are trimmed, and one pair of matching surrounding quotes
 * (single or double) is removed from the value. If the file is missing the
 * result is empty; if reading fails, whatever was parsed so far is returned.
 *
 * @returns {Record<string, string>} Parsed variables (later duplicates win).
 */
function loadEnvCredentials() {
  /** @type {Record<string, string>} */
  const vars = {};
  const envPath = resolve(__dirname, ".env");
  if (!existsSync(envPath)) return vars;

  try {
    for (const rawLine of readFileSync(envPath, "utf8").split("\n")) {
      const entry = rawLine.trim();
      if (entry === "" || entry.startsWith("#")) continue;

      const separatorAt = entry.indexOf("=");
      if (separatorAt < 0) continue;

      const name = entry.slice(0, separatorAt).trim();
      const rawValue = entry.slice(separatorAt + 1).trim();
      const isQuoted = ['"', "'"].some(
        (quote) => rawValue.startsWith(quote) && rawValue.endsWith(quote),
      );
      vars[name] = isQuoted ? rawValue.slice(1, -1) : rawValue;
    }
  } catch {
    // best effort — an unreadable file contributes nothing further
  }

  return vars;
}
175
+
176
/**
 * Interpret a loosely typed flag value as a boolean.
 *
 * Accepts (case-insensitively, after trimming) `1/true/yes/on` as true and
 * `0/false/no/off` as false. `null`, `undefined`, the empty string and any
 * unrecognised value yield `defaultValue`.
 *
 * @param {unknown} value
 * @param {boolean} defaultValue
 * @returns {boolean}
 */
function parseBool(value, defaultValue) {
  if (value == null || value === "") return defaultValue;

  switch (String(value).trim().toLowerCase()) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    case "0":
    case "false":
    case "no":
    case "off":
      return false;
    default:
      return defaultValue;
  }
}
187
+
188
/**
 * Parse a numeric setting, falling back to a default and clamping to a range.
 *
 * `null`, `undefined`, blank strings and anything that does not convert to a
 * finite number yield `defaultValue`. (Plain `Number("")` / `Number(null)`
 * would silently produce 0, so those inputs are rejected up front.)
 *
 * @param {unknown} value Raw value, typically a string from the environment.
 * @param {number} defaultValue Returned unclamped when `value` is unusable.
 * @param {number | null} [min] Lower bound; ignored unless finite.
 * @param {number | null} [max] Upper bound; ignored unless finite.
 * @returns {number}
 */
function parseNumber(value, defaultValue, min = null, max = null) {
  if (value == null) return defaultValue;
  if (typeof value === "string" && value.trim() === "") return defaultValue;

  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return defaultValue;

  let out = parsed;
  if (Number.isFinite(min)) out = Math.max(min, out);
  if (Number.isFinite(max)) out = Math.min(max, out);
  return out;
}
196
+
197
/**
 * Look up a setting, preferring the parsed `.env` map over `process.env`.
 *
 * A candidate counts only if it is non-null and not blank after trimming;
 * the first such candidate is returned unchanged, otherwise `fallback`.
 *
 * @param {Record<string, string> | null | undefined} fileVars Parsed `.env` values.
 * @param {string} key
 * @param {string} [fallback]
 * @returns {string}
 */
function getEnvValue(fileVars, key, fallback = "") {
  // Order matters: the .env file is the primary source, process.env second.
  for (const candidate of [fileVars?.[key], process.env[key]]) {
    if (candidate != null && String(candidate).trim() !== "") {
      return candidate;
    }
  }
  return fallback;
}
207
+
208
/**
 * Populate the Telegram credentials, project name and every `sentinelConfig`
 * field from the `.env` file and `process.env`.
 *
 * Lookups go through `getEnvValue`, so a non-blank `.env` entry wins over
 * `process.env`. Durations are configured in minutes or seconds (see the key
 * suffix) and stored in milliseconds after clamping to the listed range.
 */
function initEnv() {
  const fileVars = loadEnvCredentials();
  const SECOND_MS = 1000;
  const MINUTE_MS = 60_000;

  const readText = (key, fallback) => getEnvValue(fileVars, key, fallback);
  // Every boolean flag defaults to enabled.
  const readFlag = (key) => parseBool(readText(key, "1"), true);
  const readNumber = (key, fallback, min, max) =>
    parseNumber(readText(key, String(fallback)), fallback, min, max);

  telegramToken = readText("TELEGRAM_BOT_TOKEN", "");
  telegramChatId = readText("TELEGRAM_CHAT_ID", "");
  projectName = readText("PROJECT_NAME", "openfleet");

  sentinelConfig.autoRestartMonitor = readFlag("SENTINEL_AUTO_RESTART_MONITOR");
  sentinelConfig.crashLoopThreshold = readNumber(
    "SENTINEL_CRASH_LOOP_THRESHOLD",
    3,
    2,
    20,
  );
  sentinelConfig.crashLoopWindowMs =
    readNumber("SENTINEL_CRASH_LOOP_WINDOW_MIN", 10, 1, 120) * MINUTE_MS;
  sentinelConfig.monitorStartGraceMs =
    readNumber("SENTINEL_MONITOR_START_GRACE_SEC", 45, 10, 600) * SECOND_MS;
  sentinelConfig.repairAgentEnabled = readFlag("SENTINEL_REPAIR_AGENT_ENABLED");
  sentinelConfig.repairCooldownMs =
    readNumber("SENTINEL_REPAIR_COOLDOWN_MIN", 15, 1, 240) * MINUTE_MS;
  sentinelConfig.repairTimeoutMs =
    readNumber("SENTINEL_REPAIR_TIMEOUT_MIN", 20, 1, 240) * MINUTE_MS;
  sentinelConfig.primaryAgentFallbackEnabled = readFlag(
    "SENTINEL_PRIMARY_AGENT_FALLBACK_ENABLED",
  );
  sentinelConfig.primaryAgentFallbackTimeoutMs =
    readNumber("SENTINEL_PRIMARY_AGENT_TIMEOUT_MIN", 15, 1, 180) * MINUTE_MS;
  sentinelConfig.restartBackoffMs =
    readNumber("SENTINEL_RESTART_BACKOFF_SEC", 5, 0, 600) * SECOND_MS;
  sentinelConfig.manualStopHoldMs =
    readNumber("SENTINEL_MANUAL_STOP_HOLD_MIN", 10, 0, 240) * MINUTE_MS;
  sentinelConfig.monitorMonitorCheckEnabled = readFlag(
    "SENTINEL_MONITOR_MONITOR_CHECK_ENABLED",
  );
  sentinelConfig.monitorMonitorMaxAgeMs =
    readNumber("SENTINEL_MONITOR_MONITOR_MAX_AGE_MIN", 20, 1, 240) * MINUTE_MS;
}
293
+
294
/**
 * Keep only the finite timestamps that fall inside the crash-loop window
 * ending at `now` (window length: `sentinelConfig.crashLoopWindowMs`).
 *
 * @param {number[] | null | undefined} values Epoch-millisecond timestamps.
 * @param {number} [now] Reference time in epoch milliseconds.
 * @returns {number[]} A new array; the input is not modified.
 */
function pruneTimestamps(values, now = Date.now()) {
  const cutoff = now - sentinelConfig.crashLoopWindowMs;
  const isRecent = (ts) => Number.isFinite(ts) && ts >= cutoff;
  const source = values || [];
  return source.filter(isRecent);
}
298
+
299
/**
 * Persist the monitor-recovery bookkeeping (restart attempts, crash events,
 * last repair/start times, manual-stop hold) to
 * `SENTINEL_MONITOR_RECOVERY_FILE` as pretty-printed JSON.
 * Creates the parent directory if needed; any failure is ignored.
 */
function saveRecoveryState() {
  try {
    const targetDir = dirname(SENTINEL_MONITOR_RECOVERY_FILE);
    mkdirSync(targetDir, { recursive: true });

    const snapshot = {
      monitorRestartAttempts,
      monitorCrashEvents,
      lastRepairAt,
      lastMonitorStartAt,
      monitorManualStopUntil,
      updatedAt: new Date().toISOString(),
    };
    const serialized = JSON.stringify(snapshot, null, 2);
    writeFileSync(SENTINEL_MONITOR_RECOVERY_FILE, serialized, "utf8");
  } catch {
    /* best effort */
  }
}
322
+
323
/**
 * Restore the monitor-recovery bookkeeping from
 * `SENTINEL_MONITOR_RECOVERY_FILE`, if that file exists and is non-blank.
 *
 * Non-array lists become `[]` and non-numeric or falsy times become 0.
 * A missing, blank or unparsable file leaves the current state untouched.
 */
function loadRecoveryState() {
  const listOrEmpty = (candidate) => (Array.isArray(candidate) ? candidate : []);
  const numberOrZero = (candidate) => Number(candidate) || 0;

  try {
    if (!existsSync(SENTINEL_MONITOR_RECOVERY_FILE)) return;

    const text = readFileSync(SENTINEL_MONITOR_RECOVERY_FILE, "utf8");
    if (!text || !text.trim()) return;

    const saved = JSON.parse(text);
    monitorRestartAttempts = listOrEmpty(saved.monitorRestartAttempts);
    monitorCrashEvents = listOrEmpty(saved.monitorCrashEvents);
    lastRepairAt = numberOrZero(saved.lastRepairAt);
    lastMonitorStartAt = numberOrZero(saved.lastMonitorStartAt);
    monitorManualStopUntil = numberOrZero(saved.monitorManualStopUntil);
  } catch {
    /* best effort */
  }
}
342
+
343
/**
 * Append "now" to the restart-attempt history, drop entries that fell out of
 * the crash-loop window, and persist the recovery state.
 */
function recordMonitorRestartAttempt() {
  const attemptedAt = Date.now();
  monitorRestartAttempts.push(attemptedAt);
  monitorRestartAttempts = pruneTimestamps(monitorRestartAttempts, attemptedAt);
  saveRecoveryState();
}
349
+
350
/**
 * Append "now" to the crash-event history, drop entries that fell out of the
 * crash-loop window, and persist the recovery state.
 */
function recordMonitorCrashEvent() {
  const crashedAt = Date.now();
  monitorCrashEvents.push(crashedAt);
  monitorCrashEvents = pruneTimestamps(monitorCrashEvents, crashedAt);
  saveRecoveryState();
}
356
+
357
/**
 * Report whether the monitor is crash-looping: after pruning both histories
 * to the crash-loop window, either the crash events or the restart attempts
 * number at least `sentinelConfig.crashLoopThreshold`.
 *
 * Side effect: the two module-level histories are replaced by their pruned
 * versions.
 *
 * @param {number} [now] Reference time in epoch milliseconds.
 * @returns {boolean}
 */
function isCrashLoopDetected(now = Date.now()) {
  monitorRestartAttempts = pruneTimestamps(monitorRestartAttempts, now);
  monitorCrashEvents = pruneTimestamps(monitorCrashEvents, now);

  const { crashLoopThreshold: limit } = sentinelConfig;
  return [monitorCrashEvents, monitorRestartAttempts].some(
    (history) => history.length >= limit,
  );
}
366
+
367
/**
 * Judge the "monitor-monitor" health from the status file.
 *
 * Returns healthy without reading anything when the check is disabled in
 * `sentinelConfig` or via `DEVMODE_MONITOR_MONITOR_ENABLED`. Otherwise reads
 * `STATUS_FILE`, looks for a `monitor_monitor` / `monitorMonitor` section and
 * compares its last-run timestamp against `monitorMonitorMaxAgeMs`.
 * Never throws: read/parse errors are reported as an unhealthy result.
 *
 * @returns {Promise<{ ok: boolean, reason: string }>}
 */
async function assessMonitorMonitorHealth() {
  const healthy = (reason) => ({ ok: true, reason });
  const unhealthy = (reason) => ({ ok: false, reason });

  if (!sentinelConfig.monitorMonitorCheckEnabled) {
    return healthy("check disabled");
  }

  const devmodeFlag = process.env.DEVMODE_MONITOR_MONITOR_ENABLED ?? "1";
  if (!parseBool(devmodeFlag, true)) {
    return healthy("devmode monitor-monitor disabled");
  }

  try {
    if (!existsSync(STATUS_FILE)) return unhealthy("status file missing");

    const rawStatus = await readFile(STATUS_FILE, "utf8");
    const status = JSON.parse(rawStatus || "{}");
    const section = status?.monitor_monitor || status?.monitorMonitor || null;
    if (!section || typeof section !== "object") {
      return unhealthy("monitor-monitor section unavailable");
    }
    if (section.enabled === false) {
      return healthy("monitor-monitor disabled in status");
    }

    const lastRun =
      section.lastRunAt || section.last_run_at || section.last_run || null;
    if (!lastRun) {
      return unhealthy("monitor-monitor missing last run timestamp");
    }

    const age = Date.now() - new Date(lastRun).getTime();
    // A negative age means the timestamp lies in the future; treated as invalid.
    if (!Number.isFinite(age) || age < 0) {
      return unhealthy("monitor-monitor timestamp invalid");
    }
    if (age > sentinelConfig.monitorMonitorMaxAgeMs) {
      return unhealthy(`monitor-monitor stale (${formatUptime(age)} old)`);
    }
    return healthy(`healthy (${formatUptime(age)} old)`);
  } catch (err) {
    return unhealthy(err?.message || "health check failed");
  }
}
410
+
411
/**
 * Reduce an agent result to display text.
 *
 * Falsy results become "(no response)" and strings are returned as-is.
 * For objects, the first non-blank of `finalResponse` / `response` is
 * returned trimmed; otherwise the JSON form is returned, cut to 3000
 * characters, with `String(result)` as the last resort.
 *
 * @param {unknown} result
 * @returns {string}
 */
function normalizeAgentResult(result) {
  if (!result) return "(no response)";
  if (typeof result === "string") return result;

  for (const field of ["finalResponse", "response"]) {
    const candidate = result[field];
    if (typeof candidate === "string" && candidate.trim()) {
      return candidate.trim();
    }
  }

  try {
    return JSON.stringify(result).slice(0, 3000);
  } catch {
    return String(result);
  }
}
426
+
427
/**
 * Launch the primary agent with an autonomous repair prompt after a monitor
 * crash-loop, reporting start and outcome to the configured Telegram chat.
 *
 * Does nothing (returns false) when the repair agent is disabled, a repair is
 * already running, or the cooldown since the last repair has not elapsed.
 * The cooldown clock starts when a repair is launched, not when it finishes.
 *
 * @param {string} triggerReason Why recovery was triggered.
 * @param {string} [details] Optional extra context for the prompt/notification.
 * @returns {Promise<boolean>} true if the agent ran to completion.
 */
async function runRepairAgent(triggerReason, details = "") {
  if (!sentinelConfig.repairAgentEnabled) return false;
  if (recoveryInProgress) return false;

  const now = Date.now();
  const sinceLast = now - lastRepairAt;
  if (lastRepairAt > 0 && sinceLast < sentinelConfig.repairCooldownMs) {
    const remainingSec = Math.round(
      (sentinelConfig.repairCooldownMs - sinceLast) / 1000,
    );
    log("warn", `repair-agent cooldown active (${remainingSec}s remaining)`);
    return false;
  }

  recoveryInProgress = true;
  lastRepairAt = now;
  saveRecoveryState();

  try {
    await sendTelegram(
      telegramChatId,
      [
        "🧰 Crash-loop detected. Launching repair agent.",
        `Trigger: ${triggerReason}`,
        ...(details ? [`Context: ${details}`] : []),
      ].join("\n"),
    );

    await initPrimaryAgent();
    const agentInfo = getPrimaryAgentInfo();
    const mmHealth = await assessMonitorMonitorHealth();

    // The optional line is spliced in conditionally instead of filtering the
    // whole array with Boolean, which would also strip the intentional ""
    // entries that act as blank-line separators in the prompt.
    const prompt = [
      "openfleet sentinel autonomous repair request.",
      "",
      `Trigger: ${triggerReason}`,
      `Project: ${projectName}`,
      `Host: ${os.hostname()}`,
      `Crash events in window: ${monitorCrashEvents.length}`,
      `Restart attempts in window: ${monitorRestartAttempts.length}`,
      `Monitor-monitor health: ${mmHealth.ok ? "healthy" : "degraded"} (${mmHealth.reason})`,
      ...(details ? [`Additional context: ${details}`] : []),
      "",
      "Task:",
      "1) Diagnose likely monitor crash-loop root cause.",
      "2) Apply safe, minimal fixes directly in this workspace when possible.",
      "3) Return concise summary: root cause, files changed, validation performed, residual risk.",
    ].join("\n");

    const result = await execPrimaryPrompt(prompt, {
      timeoutMs: sentinelConfig.repairTimeoutMs,
    });
    const summary = normalizeAgentResult(result);
    // A missing agent-info object must not turn a finished repair into a
    // reported failure, so the adapter name is read defensively.
    const adapter = agentInfo?.adapter ?? "unknown";
    await sendTelegram(
      telegramChatId,
      [`✅ Repair agent completed via ${adapter}.`, "", summary.slice(0, 3500)].join(
        "\n",
      ),
    );
    return true;
  } catch (err) {
    await sendTelegram(
      telegramChatId,
      `❌ Repair agent failed: ${err?.message || err}`,
    );
    return false;
  } finally {
    recoveryInProgress = false;
    saveRecoveryState();
  }
}
503
+
504
/**
 * Answer a Telegram command through the primary agent while openfleet is
 * down. Announces the fallback in the chat, runs the prompt with the
 * configured timeout, and relays the (truncated) answer.
 *
 * @param {string | number} chatId Chat to reply to.
 * @param {string} text Full user input.
 * @param {string} command Command name extracted from the input.
 * @returns {Promise<boolean>} true if a reply was produced; false when the
 *   fallback is disabled or the agent failed (the failure is reported in chat).
 */
async function runPrimaryAgentFallback(chatId, text, command) {
  if (!sentinelConfig.primaryAgentFallbackEnabled) return false;

  try {
    await initPrimaryAgent();
    const agentInfo = getPrimaryAgentInfo();
    const announcement = `🤖 openfleet is down. Running via sentinel fallback (${agentInfo.adapter})...`;
    await sendTelegram(chatId, announcement);

    const promptLines = [
      "Telegram fallback request while openfleet is offline.",
      "",
      `Project: ${projectName}`,
      `Host: ${os.hostname()}`,
      `Command: ${command}`,
      "",
      "User input:",
      text,
      "",
      "Execute this request directly and return a concise, actionable response suitable for Telegram.",
      "If the exact command requires monitor internals, provide the closest equivalent action and clear next steps.",
    ];
    const agentOptions = {
      timeoutMs: sentinelConfig.primaryAgentFallbackTimeoutMs,
    };
    const result = await execPrimaryPrompt(promptLines.join("\n"), agentOptions);

    const reply = normalizeAgentResult(result).slice(0, 3600);
    await sendTelegram(
      chatId,
      reply || "(fallback completed with no text output)",
    );
    return true;
  } catch (err) {
    const failure = `❌ Sentinel fallback failed: ${err?.message || err}`;
    await sendTelegram(chatId, failure);
    return false;
  }
}
544
+
545
/**
 * Try to bring openfleet back up after it was found dead.
 *
 * Skipped when auto-restart is disabled, a monitor start is already pending,
 * or a manual-stop hold is active. If a crash loop is detected the chat is
 * warned and the repair agent is run first. After the configured backoff the
 * monitor is started and the outcome is reported to the chat.
 *
 * @param {string} triggerReason Why recovery was triggered.
 * @returns {Promise<void>}
 */
async function attemptMonitorRecovery(triggerReason) {
  if (!sentinelConfig.autoRestartMonitor || monitorStartPromise) return;
  if (Date.now() < monitorManualStopUntil) {
    log("info", "auto-restart suppressed due to recent manual stop");
    return;
  }

  const notify = (message) => sendTelegram(telegramChatId, message);

  if (isCrashLoopDetected()) {
    const mmHealth = await assessMonitorMonitorHealth();
    const windowMinutes = Math.round(sentinelConfig.crashLoopWindowMs / 60000);
    const healthLabel = mmHealth.ok ? "healthy" : "degraded";
    const warning = [
      "⚠️ Monitor crash-loop detected.",
      `Window: ${windowMinutes}m | threshold: ${sentinelConfig.crashLoopThreshold}`,
      `Monitor-monitor: ${healthLabel} (${mmHealth.reason})`,
      "Attempting autonomous repair before restart.",
    ];
    await notify(warning.join("\n"));
    await runRepairAgent(triggerReason, mmHealth.reason);
  }

  const backoffMs = sentinelConfig.restartBackoffMs;
  if (backoffMs > 0) await sleep(backoffMs);

  try {
    await ensureMonitorRunning(`sentinel recovery: ${triggerReason}`);
    const alivePid = readAlivePid(MONITOR_PID_FILE);
    const pidSuffix = alivePid ? ` (PID ${alivePid})` : "";
    await notify(`✅ openfleet recovered${pidSuffix}.`);
  } catch (err) {
    await notify(`❌ openfleet auto-restart failed: ${err?.message || err}`);
  }
}
587
+
588
+ // ── Process Utilities ────────────────────────────────────────────────────────
589
+
590
/**
 * Check if a process with the given PID is alive.
 *
 * Uses signal 0, which performs the existence/permission check without
 * delivering a signal. An `EPERM` failure means the process exists but this
 * user may not signal it, so it still counts as alive; any other failure
 * (e.g. `ESRCH`) counts as not alive.
 *
 * @param {number} pid
 * @returns {boolean}
 */
function isProcessAlive(pid) {
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return err?.code === "EPERM";
  }
}
604
+
605
/**
 * Read a PID from a file and report it only if that process is alive.
 *
 * @param {string} pidPath
 * @returns {number | null} The PID when the file exists, holds an integer and
 *   the process is alive; null in every other case, including read errors.
 */
function readAlivePid(pidPath) {
  try {
    if (!existsSync(pidPath)) return null;

    const contents = readFileSync(pidPath, "utf8").trim();
    const candidate = Number.parseInt(contents, 10);
    if (Number.isNaN(candidate)) return null;
    if (!isProcessAlive(candidate)) return null;
    return candidate;
  } catch {
    return null;
  }
}
620
+
621
/**
 * Write a PID to a file, creating the parent directory first.
 * This is a plain overwrite; every failure is ignored (best effort).
 *
 * @param {string} pidPath
 * @param {number} pid
 */
function writePidFile(pidPath, pid) {
  try {
    const parentDir = dirname(pidPath);
    mkdirSync(parentDir, { recursive: true });
    const contents = String(pid);
    writeFileSync(pidPath, contents, "utf8");
  } catch {
    /* best effort */
  }
}
634
+
635
/**
 * Delete a PID file if it exists; every failure is ignored (best effort).
 *
 * @param {string} pidPath
 */
function removePidFile(pidPath) {
  try {
    if (!existsSync(pidPath)) return;
    unlinkSync(pidPath);
  } catch {
    /* best effort */
  }
}
646
+
647
+ // ── Sentinel Lock ────────────────────────────────────────────────────────────
648
+
649
/**
 * Acquire the sentinel poll lock file. Uses exclusive create (`wx`) so only
 * one sentinel instance can create the file.
 *
 * An existing lock is reclaimed when it is blank, unreadable or corrupt, when
 * its recorded PID is not alive, or when it records this process's own PID
 * while this process does not hold the lock (a leftover from an earlier run
 * whose PID was reused). Reclaiming is retried a bounded number of times so a
 * stale file that cannot be deleted does not cause endless retries.
 *
 * @returns {Promise<boolean>} true if this process now holds the lock.
 */
async function acquireSentinelPollLock() {
  if (sentinelPollLockHeld) return true;

  const MAX_ATTEMPTS = 3;
  const payload = JSON.stringify(
    {
      owner: "sentinel",
      pid: process.pid,
      started_at: new Date().toISOString(),
    },
    null,
    2,
  );

  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    try {
      await writeFile(SENTINEL_LOCK_FILE, payload, { flag: "wx" });
      sentinelPollLockHeld = true;
      return true;
    } catch (err) {
      // Anything other than "already exists" is not recoverable by retrying.
      if (err?.code !== "EEXIST") return false;
    }

    // The lock exists — find out who holds it. NaN means "unknown / stale".
    let holderPid = Number.NaN;
    try {
      const raw = await readFile(SENTINEL_LOCK_FILE, "utf8");
      if (raw && raw.trim()) holderPid = Number(JSON.parse(raw)?.pid);
    } catch {
      // Unreadable or corrupt lock file — treated as stale below.
    }

    const heldByAnotherLiveProcess =
      holderPid !== process.pid && isProcessAlive(holderPid);
    if (heldByAnotherLiveProcess) return false;

    // Stale lock — remove it and try the exclusive create again.
    await unlink(SENTINEL_LOCK_FILE).catch(() => {});
  }

  return false;
}
696
+
697
/**
 * Release the sentinel poll lock: clear the in-memory flag and delete the
 * lock file. Does nothing if the lock is not held; deletion errors are
 * ignored.
 *
 * @returns {Promise<void>}
 */
async function releaseSentinelPollLock() {
  if (!sentinelPollLockHeld) return;

  // Clear the flag first so the lock counts as released even if unlink fails.
  sentinelPollLockHeld = false;
  try {
    await unlink(SENTINEL_LOCK_FILE);
  } catch {
    /* best effort */
  }
}
710
+
711
+ // ── Telegram API ─────────────────────────────────────────────────────────────
712
+
713
/**
 * Send a text message to a Telegram chat.
 *
 * The text is split with `splitMessage` into chunks of at most
 * `MAX_MESSAGE_LEN` characters and each chunk is POSTed to `sendMessage`
 * with a 15 s timeout. Failures are logged and never thrown. When Telegram
 * answers HTTP 400 for a chunk while a parse mode is set, that chunk alone is
 * retried once as plain text and delivery then continues with the remaining
 * chunks. Returns null immediately when no bot token is configured.
 *
 * @param {string | number} chatId
 * @param {string} text
 * @param {object} [options]
 * @param {string} [options.parseMode]
 * @param {boolean} [options.silent]
 * @returns {Promise<number | null>} The message_id of the last successfully
 *   sent chunk, or null.
 */
async function sendTelegram(chatId, text, options = {}) {
  if (!telegramToken) return null;

  const url = `https://api.telegram.org/bot${telegramToken}/sendMessage`;
  let lastMessageId = null;

  for (const chunk of splitMessage(text, MAX_MESSAGE_LEN)) {
    /** @type {Record<string, any>} */
    const payload = {
      chat_id: chatId,
      text: chunk,
      disable_web_page_preview: true,
    };
    if (options.parseMode) payload.parse_mode = options.parseMode;
    if (options.silent) payload.disable_notification = true;

    try {
      const res = await fetch(url, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(15_000),
      });

      if (!res || typeof res.ok === "undefined") {
        log("warn", "send error: invalid response object");
        continue;
      }

      if (!res.ok) {
        const body = await res.text().catch(() => "");
        log("warn", `send failed: ${res.status} ${body}`);
        // If parse_mode caused the error, retry this chunk as plain text.
        // The retry must not return from the loop, otherwise every chunk
        // after the failing one would be dropped.
        if (options.parseMode && res.status === 400) {
          const retryId = await sendTelegram(chatId, chunk, {
            ...options,
            parseMode: undefined,
          });
          if (retryId != null) lastMessageId = retryId;
        }
        continue;
      }

      try {
        const data = await res.json();
        if (data.ok && data.result?.message_id) {
          lastMessageId = data.result.message_id;
        }
      } catch {
        /* best effort — the message was accepted even if the body is unreadable */
      }
    } catch (err) {
      log("warn", `send error: ${err?.message || err}`);
    }
  }
  return lastMessageId;
}
778
+
779
/**
 * Split a text into chunks that fit within Telegram's message limit.
 *
 * A chunk boundary is placed at the last newline within the first `maxLen`
 * characters when that newline lies at or beyond 30% of `maxLen`; otherwise
 * the text is cut hard at `maxLen`. A newline used as a boundary is consumed
 * rather than carried into the next chunk. Empty input yields `["(empty)"]`.
 * A limit below 1 (or NaN) cannot make progress, so the text is returned as a
 * single chunk instead of looping forever.
 *
 * @param {string} text
 * @param {number} maxLen
 * @returns {string[]}
 */
function splitMessage(text, maxLen) {
  if (!text) return ["(empty)"];
  if (!(maxLen >= 1) || text.length <= maxLen) return [text];

  const chunks = [];
  let remaining = text;
  while (remaining.length > maxLen) {
    const newlineAt = remaining.lastIndexOf("\n", maxLen);
    if (newlineAt >= maxLen * 0.3) {
      chunks.push(remaining.slice(0, newlineAt));
      remaining = remaining.slice(newlineAt + 1); // skip the separator itself
    } else {
      chunks.push(remaining.slice(0, maxLen));
      remaining = remaining.slice(maxLen);
    }
  }
  if (remaining.length > 0) chunks.push(remaining);
  return chunks;
}
802
+
803
+ // ── Telegram Polling ─────────────────────────────────────────────────────────
804
+
805
/**
 * Long-poll the Telegram Bot API (`getUpdates`) for new message updates,
 * starting after `lastUpdateId`.
 *
 * While the request is in flight its controller is exposed as `pollAbort`;
 * an aborted request resolves to an empty list. Without a bot token, returns
 * an empty list immediately.
 *
 * @returns {Promise<Array<object>>}
 * @throws {Error} On network failure, a malformed response object, or a
 *   non-2xx status (a 409 conflict is additionally logged).
 */
async function pollUpdates() {
  if (!telegramToken) return [];

  const query = new URLSearchParams({
    offset: String(lastUpdateId + 1),
    timeout: String(POLL_TIMEOUT_S),
    allowed_updates: JSON.stringify(["message"]),
  });
  const endpoint = `https://api.telegram.org/bot${telegramToken}/getUpdates?${query}`;

  const controller = new AbortController();
  pollAbort = controller;

  let res;
  try {
    res = await fetch(endpoint, { signal: controller.signal });
  } catch (err) {
    if (err.name === "AbortError") return [];
    throw err;
  } finally {
    pollAbort = null;
  }

  if (!res || typeof res.ok === "undefined") {
    throw new Error("invalid response object from Telegram");
  }

  if (!res.ok) {
    const body = await res.text().catch(() => "");
    // 409 = conflict — another poller is active
    if (res.status === 409) {
      log(
        "warn",
        "Telegram 409 conflict — another poller is active, backing off",
      );
    }
    throw new Error(`getUpdates failed: ${res.status} ${body}`);
  }

  const data = await res.json();
  if (!data.ok) return [];
  return data.result || [];
}
851
+
852
/**
 * Standalone-mode polling loop.
 *
 * Keeps fetching and dispatching updates while the sentinel is running,
 * polling is enabled and the mode is "standalone". Any error inside an
 * iteration increases `consecutivePollErrors` and delays the next attempt
 * with exponential backoff capped at `POLL_ERROR_BACKOFF_MAX_MS`.
 */
async function pollLoop() {
  log("info", "polling loop started");

  const isActive = () => running && polling && mode === "standalone";

  while (isActive()) {
    try {
      const batch = await pollUpdates();
      consecutivePollErrors = 0;

      for (const item of batch) {
        // Advance the offset before handling so a failing handler does not
        // cause the same update to be fetched again.
        lastUpdateId = Math.max(lastUpdateId, item.update_id);
        await handleUpdate(item);
      }
    } catch (err) {
      if (!running) break;

      consecutivePollErrors += 1;
      const exponentialMs =
        POLL_ERROR_BACKOFF_BASE_MS * 2 ** (consecutivePollErrors - 1);
      const delayMs = Math.min(exponentialMs, POLL_ERROR_BACKOFF_MAX_MS);
      const delaySeconds = Math.round(delayMs / 1000);
      log(
        "warn",
        `poll error (attempt ${consecutivePollErrors}): ${err.message} — retry in ${delaySeconds}s`,
      );
      await sleep(delayMs);
    }
  }

  log("info", "polling loop stopped");
}
885
+
886
+ // ── Update Handler ───────────────────────────────────────────────────────────
887
+
888
+ /** Commands the sentinel answers itself; handleUpdate() routes these to handleStandaloneCommand(). */
889
+ const STANDALONE_COMMANDS = new Set([
890
+ "/ping",
891
+ "/status",
892
+ "/sentinel",
893
+ "/start",
894
+ "/stop",
895
+ "/help",
896
+ ]);
897
+
898
+ /** Commands that need a running openfleet monitor; handleMonitorCommand() queues these for replay when the monitor is down. */
899
+ const MONITOR_REQUIRED_COMMANDS = new Set([
900
+ "/resumetask",
901
+ "/resume",
902
+ "/tasks",
903
+ "/task",
904
+ "/sdk",
905
+ "/model",
906
+ "/switch",
907
+ "/worktrees",
908
+ "/prune",
909
+ "/batch",
910
+ "/threads",
911
+ "/rebalance",
912
+ "/logs",
913
+ "/errors",
914
+ "/restart",
915
+ "/config",
916
+ ]);
917
+
918
/**
 * Dispatch one Telegram update.
 *
 * Updates without a text message are ignored, as are messages from any chat
 * other than the configured `telegramChatId`. The first whitespace-delimited
 * word is lower-cased and stripped of an `@botname` suffix; commands listed in
 * `STANDALONE_COMMANDS` are answered locally, everything else (known monitor
 * commands, unknown commands, free text) goes to `handleMonitorCommand`.
 *
 * @param {object} update - Raw update object from `getUpdates`.
 */
async function handleUpdate(update) {
  const incoming = update.message;
  if (!incoming?.text) return;

  // Security: only accept messages from the configured chat
  const chatId = String(incoming.chat?.id);
  const authorizedChatId = String(telegramChatId);
  if (chatId !== authorizedChatId) {
    log("warn", `ignoring message from unauthorized chat ${chatId}`);
    return;
  }

  const text = incoming.text.trim();
  const [firstWord] = text.split(/\s+/);
  // Strip @botname suffix from commands (e.g. /ping@MyBot → /ping)
  const [bareCommand] = firstWord.toLowerCase().split("@");

  commandsProcessed += 1;

  if (!STANDALONE_COMMANDS.has(bareCommand)) {
    // Either a known monitor command, free-text message, or unknown command
    log("info", `command "${bareCommand}" requires openfleet`);
    await handleMonitorCommand(chatId, text, bareCommand);
    return;
  }

  await handleStandaloneCommand(chatId, bareCommand, text);
}
951
+
952
+ // ── Standalone Command Handlers ──────────────────────────────────────────────
953
+
954
/**
 * Run the handler for a command the sentinel can answer without openfleet.
 *
 * @param {string} chatId - Chat to reply to.
 * @param {string} command - Bare command (lower-case, no `@botname` suffix).
 * @param {string} fullText - Full message text (currently unused).
 */
async function handleStandaloneCommand(chatId, command, fullText) {
  const handlersByCommand = new Map([
    ["/ping", handlePing],
    ["/status", handleStatus],
    ["/sentinel", handleSentinelInfo],
    ["/start", handleStartMonitor],
    ["/stop", handleStopMonitor],
    ["/help", handleHelp],
  ]);

  const handler = handlersByCommand.get(command);
  if (handler) {
    await handler(chatId);
    return;
  }

  await sendTelegram(chatId, `Unknown standalone command: ${command}`);
}
984
+
985
/**
 * /ping — Reply with sentinel uptime, current mode, monitor liveness and host.
 * @param {string} chatId - Chat to reply to.
 */
async function handlePing(chatId) {
  const monitorPid = readAlivePid(MONITOR_PID_FILE);

  let monitorLine = "❌ not running";
  if (monitorPid) {
    monitorLine = `✅ running (PID ${monitorPid})`;
  }

  const uptimeMs = Date.now() - new Date(startedAt).getTime();
  const uptime = formatUptime(uptimeMs);

  const reply = [
    "🏓 *Pong!*",
    "",
    `Sentinel: ✅ alive (${uptime})`,
    `Mode: ${mode}`,
    `Monitor: ${monitorLine}`,
    `Host: \`${os.hostname()}\``,
  ].join("\n");

  await sendTelegram(chatId, reply, { parseMode: "Markdown" });
}
1006
+
1007
/**
 * /status — Report the contents of the cached orchestrator status file.
 *
 * Reads `STATUS_FILE` as JSON and lists executor mode, slots, time since the
 * last sync and up to ten running/pending attempts. Any read or parse error
 * is reported back to the chat instead of being thrown.
 *
 * @param {string} chatId - Chat to reply to.
 */
async function handleStatus(chatId) {
  try {
    if (!existsSync(STATUS_FILE)) {
      await sendTelegram(
        chatId,
        "📊 No status file found. openfleet may not have run yet.",
      );
      return;
    }

    const status = JSON.parse(await readFile(STATUS_FILE, "utf8"));
    const report = ["📊 *Orchestrator Status*", ""];

    if (status.executor_mode) {
      report.push(`Mode: \`${status.executor_mode}\``);
    }
    if (status.active_slots) {
      report.push(`Slots: \`${status.active_slots}\``);
    }
    if (status.last_executor_sync) {
      const sinceSyncMs =
        Date.now() - new Date(status.last_executor_sync).getTime();
      report.push(`Last sync: ${formatUptime(sinceSyncMs)} ago`);
    }

    // Show active attempts
    if (status.attempts && typeof status.attempts === "object") {
      const isActive = (attempt) =>
        attempt.status === "running" || attempt.status === "pending";
      const activeAttempts = Object.values(status.attempts).filter(isActive);

      if (activeAttempts.length === 0) {
        report.push("", "No active tasks.");
      } else {
        report.push("", "*Active Tasks:*");
        const shown = activeAttempts.slice(0, 10);
        for (const attempt of shown) {
          const title =
            attempt.task_title || attempt.task_id?.slice(0, 8) || "?";
          const executor = attempt.executor || "?";
          report.push(`• ${title} — ${attempt.status} (${executor})`);
        }
      }
    }

    await sendTelegram(chatId, report.join("\n"), { parseMode: "Markdown" });
  } catch (err) {
    await sendTelegram(chatId, `❌ Error reading status: ${err.message}`);
  }
}
1055
+
1056
/**
 * /sentinel — Reply with process, mode, counter and platform details of this
 * sentinel instance.
 * @param {string} chatId - Chat to reply to.
 */
async function handleSentinelInfo(chatId) {
  const status = getSentinelStatus();

  const uptimeMs = Date.now() - new Date(status.startedAt).getTime();
  const monitorPidText = status.monitorPid
    ? `\`${status.monitorPid}\``
    : "none";

  const report = [];
  report.push("🛡️ *Telegram Sentinel*");
  report.push("");
  report.push(`PID: \`${process.pid}\``);
  report.push(`Mode: ${status.mode}`);
  report.push(`Started: ${status.startedAt}`);
  report.push(`Uptime: ${formatUptime(uptimeMs)}`);
  report.push(`Monitor PID: ${monitorPidText}`);
  report.push(`Commands processed: ${status.commandsProcessed}`);
  report.push(`Commands queued: ${status.commandsQueued}`);
  report.push(`Poll errors: ${consecutivePollErrors}`);
  report.push(`Host: \`${os.hostname()}\``);
  report.push(`Platform: \`${process.platform} ${process.arch}\``);
  report.push(`Node: \`${process.version}\``);

  await sendTelegram(chatId, report.join("\n"), { parseMode: "Markdown" });
}
1080
+
1081
/**
 * /start — Start openfleet on request and report the outcome to the chat.
 * Does nothing but reply when a live monitor PID is already recorded.
 * @param {string} chatId - Chat to reply to.
 */
async function handleStartMonitor(chatId) {
  const runningPid = readAlivePid(MONITOR_PID_FILE);
  if (runningPid) {
    await sendTelegram(
      chatId,
      `✅ openfleet is already running (PID ${runningPid}).`,
    );
    return;
  }

  await sendTelegram(chatId, "🚀 Starting openfleet...");

  let outcome;
  try {
    await ensureMonitorRunning("manual /start command");
    const startedPid = readAlivePid(MONITOR_PID_FILE);
    const pidSuffix = startedPid ? ` (PID ${startedPid})` : "";
    outcome = `✅ openfleet started${pidSuffix}.`;
  } catch (err) {
    outcome = `❌ Failed to start openfleet: ${err.message}`;
  }

  await sendTelegram(chatId, outcome);
}
1109
+
1110
/**
 * /stop — Manually stop openfleet.
 *
 * Sends SIGTERM to the monitor, waits up to 10 s (20 × 500 ms) for it to
 * exit, escalates to SIGKILL if it is still alive, removes the monitor PID
 * file and switches the sentinel to standalone mode.
 *
 * The manual-stop hold (`monitorManualStopUntil`) is armed BEFORE the signal
 * is sent. The periodic health check can run while this function is waiting;
 * it would otherwise see the dead monitor and start recovery before the hold
 * is recorded.
 * NOTE(review): assumes attemptMonitorRecovery() honours
 * `monitorManualStopUntil` — confirm against its implementation.
 *
 * @param {string} chatId - Chat to reply to.
 */
async function handleStopMonitor(chatId) {
  const monPid = readAlivePid(MONITOR_PID_FILE);
  if (!monPid) {
    await sendTelegram(chatId, "ℹ️ openfleet is not running.");
    return;
  }
  await sendTelegram(chatId, `🛑 Stopping openfleet (PID ${monPid})...`);

  const previousHoldUntil = monitorManualStopUntil;
  let stopped = false;
  try {
    // Arm the hold first so a concurrent health check sees the stop as manual.
    monitorManualStopUntil = Date.now() + sentinelConfig.manualStopHoldMs;
    saveRecoveryState();

    try {
      process.kill(monPid, "SIGTERM");
    } catch (err) {
      // ESRCH: the process exited on its own in the meantime — already stopped.
      if (err.code !== "ESRCH") throw err;
    }

    // Wait for process to die
    let gone = false;
    for (let i = 0; i < 20; i++) {
      await sleep(500);
      if (!isProcessAlive(monPid)) {
        gone = true;
        break;
      }
    }
    if (!gone) {
      try {
        process.kill(monPid, "SIGKILL");
      } catch {
        /* best effort */
      }
    }
    removePidFile(MONITOR_PID_FILE);
    stopped = true;

    await sendTelegram(chatId, "✅ openfleet stopped.");
    // Restart the hold window from the moment the monitor is actually down.
    monitorManualStopUntil = Date.now() + sentinelConfig.manualStopHoldMs;
    saveRecoveryState();
    // Transition to standalone mode after stopping monitor
    await transitionToStandalone("monitor manually stopped");
  } catch (err) {
    if (!stopped) {
      // The stop itself failed: do not keep a hold for a monitor still running.
      monitorManualStopUntil = previousHoldUntil;
      try {
        saveRecoveryState();
      } catch (saveErr) {
        log("warn", `failed to restore recovery state: ${saveErr.message}`);
      }
    }
    await sendTelegram(chatId, `❌ Error stopping monitor: ${err.message}`);
  }
}
1149
+
1150
/**
 * /help — List the commands the sentinel answers itself and say how other
 * commands are routed given the monitor's current state.
 * @param {string} chatId - Chat to reply to.
 */
async function handleHelp(chatId) {
  const monitorAlive = Boolean(readAlivePid(MONITOR_PID_FILE));
  const monitorState = monitorAlive ? "running" : "stopped";
  const routing = monitorAlive ? "be forwarded to" : "auto-start";

  const helpText = [
    "🛡️ *Sentinel Commands* (always available)",
    "",
    "/ping — Check sentinel + monitor liveness",
    "/status — Show cached orchestrator status",
    "/sentinel — Show sentinel details",
    "/start — Start openfleet",
    "/stop — Stop openfleet",
    "/help — This message",
    "",
    `Monitor is *${monitorState}*. All other commands will ${routing} openfleet.`,
  ].join("\n");

  await sendTelegram(chatId, helpText, { parseMode: "Markdown" });
}
1173
+
1174
+ // ── Monitor-Required Command Handling ────────────────────────────────────────
1175
+
1176
/**
 * Handle a message that the sentinel cannot answer by itself.
 *
 * - Monitor alive: queue the message, persist the queue and return.
 * - Monitor down: optionally run the primary-agent fallback, queue the message
 *   when it is a known monitor command, then start openfleet (always for
 *   known monitor commands, otherwise only when `autoRestartMonitor` is set).
 *   If the start fails the whole in-memory queue is cleared.
 *
 * @param {string} chatId - Chat to reply to.
 * @param {string} text - Full message text, queued verbatim for replay.
 * @param {string} command - Bare command extracted from `text`.
 */
async function handleMonitorCommand(chatId, text, command) {
  const monitorAlive = Boolean(readAlivePid(MONITOR_PID_FILE));
  const needsMonitor = MONITOR_REQUIRED_COMMANDS.has(command);

  if (monitorAlive) {
    // Monitor is running but sentinel is somehow in standalone mode — this
    // can happen briefly during transitions. Queue the command for the
    // monitor to pick up.
    queueCommand(chatId, text);
    await writeCommandQueueFile();
    log("info", "monitor running — queued command for replay");
    return;
  }

  const fallbackHandled = sentinelConfig.primaryAgentFallbackEnabled
    ? await runPrimaryAgentFallback(chatId, text, command)
    : false;

  if (needsMonitor) {
    queueCommand(chatId, text);
  }

  const shouldStartMonitor = sentinelConfig.autoRestartMonitor || needsMonitor;
  if (!shouldStartMonitor) {
    return;
  }

  await sendTelegram(chatId, "⏳ Starting openfleet in the background...");

  try {
    await ensureMonitorRunning(`command: ${command}`);
    if (commandQueue.length > 0) {
      await writeCommandQueueFile();
    }
    log(
      "info",
      `monitor started — ${commandQueue.length} command(s) queued for replay`,
    );
  } catch (err) {
    if (!fallbackHandled) {
      const failure = `❌ Failed to start openfleet: ${err.message}\n\nYour command was not processed.`;
      await sendTelegram(chatId, failure);
    }
    // Clear the failed commands
    commandQueue = [];
  }
}
1232
+
1233
+ // ── Command Queue ────────────────────────────────────────────────────────────
1234
+
1235
/**
 * Append a command to the in-memory replay queue.
 *
 * Entries older than `COMMAND_QUEUE_TTL_MS` are evicted first; if the queue is
 * still at `COMMAND_QUEUE_MAX_SIZE`, the oldest entry is dropped to make room.
 *
 * @param {string | number} chatId - Originating chat; stored as a string.
 * @param {string} text - Full message text to replay.
 */
function queueCommand(chatId, text) {
  const now = Date.now();

  // Evict stale commands
  const isFresh = (entry) => now - entry.timestamp < COMMAND_QUEUE_TTL_MS;
  const pending = commandQueue.filter(isFresh);

  // Enforce max queue size
  if (pending.length >= COMMAND_QUEUE_MAX_SIZE) {
    log(
      "warn",
      `command queue full (${COMMAND_QUEUE_MAX_SIZE}), dropping oldest`,
    );
    pending.shift();
  }

  pending.push({ chatId: String(chatId), text, timestamp: now });
  commandQueue = pending;
}
1258
+
1259
/**
 * Persist the replay queue as pretty-printed JSON at
 * `SENTINEL_COMMAND_QUEUE_FILE`, creating the parent directory if needed.
 * Failures are logged as warnings and never thrown.
 * @returns {Promise<void>}
 */
async function writeCommandQueueFile() {
  const targetDir = dirname(SENTINEL_COMMAND_QUEUE_FILE);
  try {
    mkdirSync(targetDir, { recursive: true });
    const serialized = JSON.stringify(commandQueue, null, 2);
    await writeFile(SENTINEL_COMMAND_QUEUE_FILE, serialized, "utf8");
  } catch (err) {
    log("warn", `failed to write command queue: ${err.message}`);
  }
}
1275
+
1276
/**
 * Return a shallow copy of the replay queue, so callers cannot add or remove
 * entries of the internal array.
 * @returns {Array<{ chatId: string, text: string, timestamp: number }>}
 */
export function getQueuedCommands() {
  return commandQueue.slice();
}
1283
+
1284
+ // ── Monitor Lifecycle ────────────────────────────────────────────────────────
1285
+
1286
/**
 * Report whether a live openfleet PID is recorded in `MONITOR_PID_FILE`.
 * @returns {boolean} `true` unless `readAlivePid` returns `null`.
 */
export function isMonitorRunning() {
  const alivePid = readAlivePid(MONITOR_PID_FILE);
  return alivePid !== null;
}
1293
+
1294
/**
 * Make sure openfleet is running, starting it if necessary.
 *
 * Resolves immediately when a live monitor PID is recorded. Concurrent callers
 * share a single in-flight start through `monitorStartPromise`. A failed start
 * is recorded as a crash event and rethrown to every waiting caller.
 *
 * @param {string} reason - Human-readable reason for starting the monitor.
 * @returns {Promise<void>}
 */
export async function ensureMonitorRunning(reason) {
  // Already running
  if (readAlivePid(MONITOR_PID_FILE)) return;

  // Another call is already starting the monitor — piggyback on it
  if (monitorStartPromise) {
    log("info", `waiting for in-progress monitor start (reason: ${reason})`);
    return monitorStartPromise;
  }

  recordMonitorRestartAttempt();

  const startAttempt = (async () => {
    try {
      await startAndWaitForMonitor(reason);
    } catch (err) {
      recordMonitorCrashEvent();
      throw err;
    }
  })();
  monitorStartPromise = startAttempt;

  try {
    await startAttempt;
  } finally {
    monitorStartPromise = null;
  }
}
1323
+
1324
/**
 * Spawn openfleet (`cli.mjs --daemon-child`) as a detached child and wait
 * until a live PID shows up in `MONITOR_PID_FILE`.
 *
 * If the sentinel is polling, polling is stopped and the sentinel poll lock is
 * released first. On success the sentinel switches to companion mode.
 *
 * @param {string} reason - Reason included in the log line.
 * @returns {Promise<void>}
 * @throws {Error} If the child has no PID, exits before becoming healthy, or
 *   is not healthy within `MONITOR_START_TIMEOUT_MS`.
 */
async function startAndWaitForMonitor(reason) {
  log("info", `starting openfleet (reason: ${reason})`);

  // If sentinel is currently polling, release the sentinel lock.
  // The monitor's telegram-bot.mjs will acquire its own poll lock.
  if (polling) {
    polling = false;
    try {
      pollAbort?.abort();
    } catch {
      /* ok */
    }
    await releaseSentinelPollLock();
    log("info", "released sentinel poll lock for monitor startup");
  }

  // Ensure log directory exists for daemon output
  const daemonLogDir = dirname(resolve(__dirname, "logs", "daemon.log"));
  try {
    mkdirSync(daemonLogDir, { recursive: true });
  } catch {
    /* ok */
  }

  // Start cli.mjs as a detached daemon child
  const nodeArgs = [
    "--max-old-space-size=4096",
    resolve(__dirname, "cli.mjs"),
    "--daemon-child",
  ];
  const spawnOptions = {
    detached: true,
    stdio: "ignore",
    env: { ...process.env, CODEX_MONITOR_DAEMON: "1" },
    cwd: repoRoot,
  };
  const child = spawn(process.execPath, nodeArgs, spawnOptions);

  child.on("error", (err) => {
    log("error", `monitor spawn error: ${err.message}`);
  });
  child.unref();

  const childPid = child.pid;
  if (!childPid) {
    throw new Error("openfleet failed to spawn (no PID)");
  }

  log("info", `monitor spawned (PID ${childPid}), waiting for health...`);

  // Wait for the monitor to become healthy (PID file written + process alive)
  const deadline = Date.now() + MONITOR_START_TIMEOUT_MS;
  for (let now = Date.now(); now < deadline; now = Date.now()) {
    await sleep(MONITOR_HEALTH_POLL_MS);

    const healthyPid = readAlivePid(MONITOR_PID_FILE);
    if (healthyPid) {
      log("info", `monitor is healthy (PID ${healthyPid})`);
      lastMonitorStartAt = Date.now();
      saveRecoveryState();
      // Transition to companion mode
      await transitionToCompanion(healthyPid);
      return;
    }

    // Check if spawned process died prematurely
    if (!isProcessAlive(childPid)) {
      throw new Error(
        `openfleet process died during startup (PID ${childPid})`,
      );
    }
  }

  throw new Error(
    `openfleet did not become healthy within ${MONITOR_START_TIMEOUT_MS / 1000}s`,
  );
}
1413
+
1414
+ // ── Mode Transitions ─────────────────────────────────────────────────────────
1415
+
1416
/**
 * Switch the sentinel to standalone mode and, if possible, start polling.
 *
 * Polling is not started when the main bot's poll lock is held by a live
 * process or when the sentinel poll lock cannot be acquired. Before the loop
 * starts, pending updates are fetched once and skipped.
 *
 * @param {string} reason - Reason included in the log line.
 */
async function transitionToStandalone(reason) {
  const alreadyPolling = mode === "standalone" && polling;
  if (alreadyPolling) {
    log("debug", `already in standalone mode (${reason})`);
    return;
  }

  log("info", `transitioning to standalone mode: ${reason}`);
  mode = "standalone";

  // Check if the main bot poll lock is held by a live process
  if (await isMainBotPolling()) {
    log("info", "main bot is still polling — skipping sentinel poll start");
    return;
  }

  // Acquire sentinel poll lock and start polling
  const haveLock = await acquireSentinelPollLock();
  if (!haveLock) {
    log(
      "warn",
      "failed to acquire sentinel poll lock — another sentinel may be running",
    );
    return;
  }

  // Clear stale updates before starting the loop
  try {
    const backlog = await pollUpdates();
    for (const entry of backlog) {
      lastUpdateId = Math.max(lastUpdateId, entry.update_id);
    }
    if (backlog.length > 0) {
      log("info", `skipped ${backlog.length} stale updates`);
    }
  } catch {
    /* best effort */
  }

  polling = true;
  consecutivePollErrors = 0;

  // Fire polling loop (non-blocking)
  const onLoopCrash = (err) => {
    log("error", `poll loop crashed: ${err.message}`);
    polling = false;
  };
  pollLoop().catch(onLoopCrash);

  await writeHeartbeat();
}
1470
+
1471
/**
 * Switch the sentinel to companion mode: stop polling, abort any in-flight
 * poll, release the sentinel poll lock and write a fresh heartbeat.
 * @param {number} monitorPid - PID of the live monitor (used for logging).
 */
async function transitionToCompanion(monitorPid) {
  log("info", `transitioning to companion mode (monitor PID ${monitorPid})`);
  mode = "companion";

  // Stop polling if active
  polling = false;
  try {
    pollAbort?.abort();
  } catch {
    /* ok */
  }

  await releaseSentinelPollLock();
  await writeHeartbeat();
}
1492
+
1493
/**
 * Check if the main telegram-bot.mjs poll lock is held by a live process.
 *
 * Reads `MONITOR_POLL_LOCK_FILE` as JSON and checks the recorded `pid`.
 * A missing, empty or unparseable lock file, or a lock without a valid
 * positive integer PID, counts as "not polling".
 *
 * @returns {Promise<boolean>}
 */
async function isMainBotPolling() {
  try {
    if (!existsSync(MONITOR_POLL_LOCK_FILE)) return false;
    const raw = await readFile(MONITOR_POLL_LOCK_FILE, "utf8");
    if (!raw || !raw.trim()) return false;

    const pid = Number(JSON.parse(raw)?.pid);
    // A missing/null pid becomes NaN or 0. PID 0 addresses the caller's own
    // process group in process.kill(), so a liveness probe could wrongly
    // succeed. NOTE(review): isProcessAlive() may already guard this — confirm.
    if (!Number.isInteger(pid) || pid <= 0) return false;

    return isProcessAlive(pid);
  } catch {
    return false;
  }
}
1509
+
1510
+ // ── Health Monitoring ────────────────────────────────────────────────────────
1511
+
1512
/**
 * Periodic health check for openfleet, scheduled by startSentinel().
 *
 * - Companion mode with a dead monitor: record the crash, notify the chat,
 *   switch to standalone mode and attempt recovery.
 * - Standalone mode with a live monitor: switch to companion mode.
 * - Standalone mode without a monitor: reconcile who is polling (main bot vs.
 *   sentinel) and, when `autoRestartMonitor` is set and no start is in flight,
 *   attempt recovery.
 *
 * Finally, stop this instance if another live sentinel owns the PID file;
 * otherwise write a heartbeat.
 */
async function healthCheck() {
  const monitorPid = readAlivePid(MONITOR_PID_FILE);
  const monitorAlive = Boolean(monitorPid);

  if (mode === "companion" && !monitorAlive) {
    // Monitor died while in companion mode — send crash notification and go standalone
    log("warn", "monitor process died — transitioning to standalone");
    removePidFile(MONITOR_PID_FILE);
    recordMonitorCrashEvent();

    const sinceStartMs =
      lastMonitorStartAt > 0 ? Date.now() - lastMonitorStartAt : null;
    const rapidCrash =
      Number.isFinite(sinceStartMs) &&
      sinceStartMs <= sentinelConfig.monitorStartGraceMs;

    // Notify user
    const host = os.hostname();
    const tag = projectName ? `[${projectName}]` : "";
    const rapidCrashLine = rapidCrash
      ? `Detected rapid crash (${formatUptime(sinceStartMs)} after startup).`
      : "";
    const notice = [
      `🔥 ${tag} openfleet crashed`,
      "",
      `Host: \`${host}\``,
      `Time: ${new Date().toISOString()}`,
      rapidCrashLine,
      "",
      "Sentinel is switching to standalone mode and will attempt automatic recovery.",
    ].join("\n");
    await sendTelegram(telegramChatId, notice, { parseMode: "Markdown" });

    await transitionToStandalone("monitor process died");
    await attemptMonitorRecovery("monitor crashed in companion mode");
  } else if (mode === "standalone" && monitorAlive) {
    // Monitor appeared while in standalone mode (started externally)
    log(
      "info",
      `monitor detected (PID ${monitorPid}) — switching to companion mode`,
    );
    await transitionToCompanion(monitorPid);
  } else if (mode === "standalone") {
    // Check if main bot has acquired the poll lock (edge case: monitor starting up)
    const mainBotPolling = await isMainBotPolling();
    if (mainBotPolling && polling) {
      log("info", "main bot is polling — stopping sentinel polling");
      polling = false;
      try {
        pollAbort?.abort();
      } catch {
        /* ok */
      }
      await releaseSentinelPollLock();
    } else if (!mainBotPolling && !polling) {
      // Neither is polling — sentinel should resume
      log("info", "no poller active — resuming sentinel polling");
      await transitionToStandalone("no active poller detected");
    }

    if (sentinelConfig.autoRestartMonitor && !monitorStartPromise) {
      await attemptMonitorRecovery(
        "monitor not running during standalone health check",
      );
    }
  }

  // Clean up stale PID files
  const recordedSentinelPid = readAlivePid(SENTINEL_PID_FILE);
  const anotherSentinelAlive =
    recordedSentinelPid && recordedSentinelPid !== process.pid;
  if (anotherSentinelAlive) {
    // Another sentinel is alive — we shouldn't be running
    log(
      "warn",
      `another sentinel is alive (PID ${recordedSentinelPid}) — stopping this instance`,
    );
    stopSentinel();
    return;
  }

  await writeHeartbeat();
}
1601
+
1602
+ // ── Heartbeat ────────────────────────────────────────────────────────────────
1603
+
1604
/**
 * Write a snapshot of the sentinel state to `SENTINEL_HEARTBEAT_FILE` as
 * pretty-printed JSON, creating the parent directory if needed. Write
 * failures are logged as warnings and never thrown.
 * @returns {Promise<void>}
 */
async function writeHeartbeat() {
  /** @type {import("./telegram-sentinel.mjs").SentinelHeartbeat} */
  const snapshot = {
    pid: process.pid,
    startedAt,
    mode,
    monitorPid: readAlivePid(MONITOR_PID_FILE),
    lastCheck: new Date().toISOString(),
    commandsQueued: commandQueue.length,
    commandsProcessed,
  };

  const heartbeatDir = dirname(SENTINEL_HEARTBEAT_FILE);
  try {
    mkdirSync(heartbeatDir, { recursive: true });
    const serialized = JSON.stringify(snapshot, null, 2);
    await writeFile(SENTINEL_HEARTBEAT_FILE, serialized, "utf8");
  } catch (err) {
    log("warn", `heartbeat write failed: ${err.message}`);
  }
}
1631
+
1632
+ // ── Public API ───────────────────────────────────────────────────────────────
1633
+
1634
/**
 * Start the Telegram sentinel. This is the main entry point.
 *
 * Loads the environment, verifies the Telegram credentials, claims the
 * sentinel PID file, picks the initial mode from the monitor's liveness,
 * schedules health checks and heartbeats, and installs process-level
 * shutdown and crash handlers.
 *
 * Exits the process with code 1 when the credentials are missing or when
 * another live sentinel is detected (unless `skipExistingCheck` is set).
 *
 * @param {object} [options]
 * @param {boolean} [options.skipExistingCheck] - Skip checking for an existing sentinel.
 * @returns {Promise<void>}
 */
export async function startSentinel(options = {}) {
  if (running) {
    log("warn", "sentinel is already running");
    return;
  }

  initEnv();

  if (!telegramToken || !telegramChatId) {
    log(
      "error",
      "cannot start sentinel: TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID not configured",
    );
    console.error(
      `${TAG} Set these in .env (project root) or as environment variables.`,
    );
    process.exit(1);
  }

  // Ensure cache directory exists
  mkdirSync(cacheDir, { recursive: true });
  mkdirSync(dirname(MONITOR_PID_FILE), { recursive: true });

  // Check for existing sentinel
  if (!options.skipExistingCheck) {
    const existingPid = readAlivePid(SENTINEL_PID_FILE);
    if (existingPid && existingPid !== process.pid) {
      console.error(
        `${TAG} Another sentinel is already running (PID ${existingPid}). Use --stop first.`,
      );
      process.exit(1);
    }
  }

  running = true;
  startedAt = new Date().toISOString();
  loadRecoveryState();
  writePidFile(SENTINEL_PID_FILE, process.pid);

  log("info", `sentinel started (PID ${process.pid})`);

  // Determine initial mode
  const monPid = readAlivePid(MONITOR_PID_FILE);
  if (monPid) {
    log(
      "info",
      `openfleet already running (PID ${monPid}) — starting in companion mode`,
    );
    await transitionToCompanion(monPid);
  } else {
    log("info", "openfleet not running — starting in standalone mode");
    await transitionToStandalone("initial startup");
  }

  // Set up periodic health checks
  healthCheckTimer = setInterval(() => {
    healthCheck().catch((err) => {
      log("error", `health check error: ${err.message}`);
    });
  }, HEALTH_CHECK_INTERVAL_MS);
  if (healthCheckTimer.unref) healthCheckTimer.unref();

  // Set up periodic heartbeat writes
  heartbeatTimer = setInterval(() => {
    writeHeartbeat().catch(() => {});
  }, HEALTH_CHECK_INTERVAL_MS);
  if (heartbeatTimer.unref) heartbeatTimer.unref();

  // Initial heartbeat
  await writeHeartbeat();

  // Register shutdown handlers
  const shutdown = () => {
    log("info", "received shutdown signal");
    stopSentinel();
    process.exit(0);
  };
  process.on("SIGINT", shutdown);
  process.on("SIGTERM", shutdown);

  // Upper bound on how long a crash may be delayed by the notification.
  const CRASH_NOTIFY_TIMEOUT_MS = 5000;
  let crashExitPending = false;
  process.on("uncaughtException", (err) => {
    log("error", `uncaught exception: ${err.message}\n${err.stack}`);
    // A second exception while the notification is in flight must not start
    // another send/exit sequence.
    if (crashExitPending) return;
    crashExitPending = true;

    const exitAfterCleanup = () => {
      stopSentinel();
      process.exit(1);
    };
    // process.exit() does not wait for pending I/O, so exiting right after
    // starting the request would drop the notification. Give it a bounded
    // amount of time to complete, then exit either way.
    const deadline = setTimeout(exitAfterCleanup, CRASH_NOTIFY_TIMEOUT_MS);
    Promise.resolve()
      .then(() =>
        sendTelegram(
          telegramChatId,
          `🛡️❌ Sentinel crashed: ${err.message}\nHost: \`${os.hostname()}\``,
          { parseMode: "Markdown" },
        ),
      )
      .catch(() => {})
      .finally(() => {
        clearTimeout(deadline);
        exitAfterCleanup();
      });
  });
  process.on("unhandledRejection", (reason) => {
    log("error", `unhandled rejection: ${reason}`);
  });
}
1735
+
1736
/**
 * Stop the sentinel: abort any in-flight poll, clear both interval timers,
 * release the sentinel poll lock (not awaited), and remove the sentinel PID
 * and heartbeat files. Does nothing when the sentinel is not running.
 */
export function stopSentinel() {
  if (!running) return;

  running = false;
  polling = false;

  // Abort any pending poll
  try {
    pollAbort?.abort();
  } catch {
    /* ok */
  }

  // Clear timers
  if (healthCheckTimer) {
    clearInterval(healthCheckTimer);
    healthCheckTimer = null;
  }
  if (heartbeatTimer) {
    clearInterval(heartbeatTimer);
    heartbeatTimer = null;
  }

  // Release locks and PID files
  releaseSentinelPollLock().catch(() => {});
  removePidFile(SENTINEL_PID_FILE);

  // Clean up heartbeat file
  try {
    const heartbeatExists = existsSync(SENTINEL_HEARTBEAT_FILE);
    if (heartbeatExists) {
      unlinkSync(SENTINEL_HEARTBEAT_FILE);
    }
  } catch {
    /* best effort */
  }

  log("info", "sentinel stopped");
}
1777
+
1778
/**
 * Build a snapshot of the sentinel's current state.
 * @returns {SentinelStatus} Includes `uptime`, the milliseconds elapsed since
 *   `startedAt`.
 */
export function getSentinelStatus() {
  const monitorPid = readAlivePid(MONITOR_PID_FILE);
  const uptime = Date.now() - new Date(startedAt).getTime();

  return {
    pid: process.pid,
    running,
    startedAt,
    mode,
    monitorPid,
    polling,
    commandsQueued: commandQueue.length,
    commandsProcessed,
    consecutivePollErrors,
    uptime,
  };
}
1796
+
1797
/**
 * Summarize the monitor recovery bookkeeping: crash and restart counts from
 * the pruned timestamp lists, the crash-loop verdict and thresholds, the last
 * repair time, and whether a recovery is in progress.
 * @returns {object}
 */
export function getSentinelRecoveryStatus() {
  const now = Date.now();
  const crashesInWindow = pruneTimestamps(monitorCrashEvents, now).length;
  const restartsInWindow = pruneTimestamps(monitorRestartAttempts, now).length;
  const crashLoopDetected = isCrashLoopDetected(now);

  return {
    crashLoopDetected,
    crashesInWindow,
    restartsInWindow,
    crashLoopThreshold: sentinelConfig.crashLoopThreshold,
    crashLoopWindowMs: sentinelConfig.crashLoopWindowMs,
    lastRepairAt,
    recoveryInProgress,
  };
}
1811
+
1812
/**
 * Test hook: overwrite the module-level recovery bookkeeping.
 *
 * Array fields are copied (non-arrays become `[]`); numeric fields are coerced
 * with `Number()` and fall back to `0` when falsy or NaN.
 *
 * @param {object} [state]
 */
export function __setRecoveryStateForTest(state = {}) {
  const copyList = (value) => (Array.isArray(value) ? [...value] : []);
  const toNumber = (value) => Number(value) || 0;

  monitorRestartAttempts = copyList(state.monitorRestartAttempts);
  monitorCrashEvents = copyList(state.monitorCrashEvents);
  lastRepairAt = toNumber(state.lastRepairAt);
  lastMonitorStartAt = toNumber(state.lastMonitorStartAt);
  monitorManualStopUntil = toNumber(state.monitorManualStopUntil);
}
1823
+
1824
+ // ── Logging ──────────────────────────────────────────────────────────────────
1825
+
1826
/**
 * Minimal logger: prefixes every line with an ISO timestamp and `TAG`.
 *
 * "error" goes to `console.error`, "warn" to `console.warn`; "debug" lines
 * are printed only when `SENTINEL_DEBUG` is "1"; any other level is printed
 * through `console.log` without a level label.
 *
 * @param {"info" | "warn" | "error" | "debug"} level
 * @param {string} message
 */
function log(level, message) {
  const prefix = `${new Date().toISOString()} ${TAG}`;

  if (level === "error") {
    console.error(`${prefix} ERROR: ${message}`);
    return;
  }
  if (level === "warn") {
    console.warn(`${prefix} WARN: ${message}`);
    return;
  }
  if (level === "debug") {
    const debugEnabled = process.env.SENTINEL_DEBUG === "1";
    if (debugEnabled) {
      console.log(`${prefix} DEBUG: ${message}`);
    }
    return;
  }

  console.log(`${prefix} ${message}`);
}
1850
+
1851
+ // ── Utility ──────────────────────────────────────────────────────────────────
1852
+
1853
/**
 * Format a duration in milliseconds as a short human-readable string.
 *
 * Only the most significant units are shown:
 *   - `Nd Nh Nm` once the duration reaches a day,
 *   - `Nh Nm` once it reaches an hour,
 *   - `Nm Ns` once it reaches a minute,
 *   - `Ns` otherwise.
 *
 * Negative and non-finite inputs (NaN, ±Infinity, undefined) are treated as 0,
 * so the result never contains "NaN" or "Infinity".
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string}
 */
function formatUptime(ms) {
  // Coerce once, then clamp anything that is not a positive finite number.
  const numericMs = Number(ms);
  const safeMs = Number.isFinite(numericMs) && numericMs > 0 ? numericMs : 0;

  const seconds = Math.floor(safeMs / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);
  const days = Math.floor(hours / 24);

  if (days > 0) return `${days}d ${hours % 24}h ${minutes % 60}m`;
  if (hours > 0) return `${hours}h ${minutes % 60}m`;
  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
  return `${seconds}s`;
}
1870
+
1871
/**
 * Return a promise that resolves (with no value) after a `setTimeout` of `ms`.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  // Named `wake` rather than `resolve` so the path helper `resolve()` used
  // elsewhere in this file is not shadowed.
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
1879
+
1880
+ // ── Type Definitions (JSDoc) ─────────────────────────────────────────────────
1881
+
1882
+ /**
1883
+ * @typedef {object} SentinelHeartbeat
1884
+ * @property {number} pid
1885
+ * @property {string} startedAt
1886
+ * @property {"standalone" | "companion"} mode
1887
+ * @property {number | null} monitorPid
1888
+ * @property {string} lastCheck
1889
+ * @property {number} commandsQueued
1890
+ * @property {number} commandsProcessed
1891
+ */
1892
+
1893
+ /**
1894
+ * @typedef {object} SentinelStatus
1895
+ * @property {number} pid
1896
+ * @property {boolean} running
1897
+ * @property {string} startedAt
1898
+ * @property {"standalone" | "companion"} mode
1899
+ * @property {number | null} monitorPid
1900
+ * @property {boolean} polling
1901
+ * @property {number} commandsQueued
1902
+ * @property {number} commandsProcessed
1903
+ * @property {number} consecutivePollErrors
1904
+ * @property {number} uptime
1905
+ */
1906
+
1907
+ // ── CLI Entry Point ──────────────────────────────────────────────────────────
1908
+
1909
// True when this module is the script named in process.argv[1], i.e. it was
// launched directly rather than imported. Any error during detection yields false.
const isDirectExecution = (() => {
  // Make paths comparable across platforms: forward slashes, lower-cased.
  const toComparable = (filePath) =>
    filePath.split("\\").join("/").toLowerCase();

  try {
    const modulePath = fileURLToPath(import.meta.url);
    const entryArg = process.argv[1];
    if (!entryArg) {
      return false;
    }
    return toComparable(modulePath) === toComparable(resolve(entryArg));
  } catch {
    return false;
  }
})();
1922
+
1923
// CLI behavior when this file is the process entry point:
//   --help / -h  print usage and exit 0
//   --stop       terminate the sentinel recorded in SENTINEL_PID_FILE
//   --status     print the sentinel PID and heartbeat details, exit 0
//   (no flag)    start the sentinel
// Flags are checked in that order; the first match wins.
if (isDirectExecution) {
  const args = process.argv.slice(2);

  if (args.includes("--help") || args.includes("-h")) {
    console.log(`
telegram-sentinel — Always-on Telegram command listener for openfleet

USAGE
node telegram-sentinel.mjs [options]

OPTIONS
--stop Stop a running sentinel
--status Check sentinel status
--help Show this help

ENVIRONMENT
TELEGRAM_BOT_TOKEN Telegram bot token (or set in .env)
TELEGRAM_CHAT_ID Authorized chat ID (or set in .env)
SENTINEL_DEBUG=1 Enable debug logging

The sentinel monitors openfleet and handles Telegram commands
even when the main process is not running.
`);
    process.exit(0);
  }

  if (args.includes("--stop")) {
    const pid = readAlivePid(SENTINEL_PID_FILE);
    if (!pid) {
      console.log(" No sentinel running.");
      removePidFile(SENTINEL_PID_FILE);
      process.exit(0);
    }
    console.log(` Stopping sentinel (PID ${pid})...`);
    try {
      let gone = false;
      try {
        process.kill(pid, "SIGTERM");
      } catch (err) {
        // ESRCH means the process exited between the liveness check and the
        // signal. That is the outcome we wanted, so fall through to PID-file
        // cleanup instead of reporting a failure.
        if (err?.code !== "ESRCH") throw err;
        gone = true;
      }

      // Poll every 500 ms, up to 20 times (~10 s), for the process to exit.
      for (let attempt = 0; attempt < 20 && !gone; attempt++) {
        await sleep(500);
        gone = !isProcessAlive(pid);
      }

      if (!gone) {
        // Still alive after the grace period: escalate to SIGKILL.
        try {
          process.kill(pid, "SIGKILL");
        } catch {
          /* ok */
        }
      }
      removePidFile(SENTINEL_PID_FILE);
      console.log(" ✓ Sentinel stopped.");
    } catch (err) {
      console.error(` Failed: ${err?.message ?? err}`);
      process.exit(1);
    }
    process.exit(0);
  }

  if (args.includes("--status")) {
    const pid = readAlivePid(SENTINEL_PID_FILE);
    if (pid) {
      console.log(` Sentinel is running (PID ${pid})`);
      try {
        if (existsSync(SENTINEL_HEARTBEAT_FILE)) {
          const hb = JSON.parse(readFileSync(SENTINEL_HEARTBEAT_FILE, "utf8"));
          console.log(` Mode: ${hb.mode}`);
          console.log(` Monitor PID: ${hb.monitorPid || "none"}`);
          console.log(` Last check: ${hb.lastCheck}`);
          console.log(` Commands processed: ${hb.commandsProcessed}`);
        }
      } catch {
        /* best effort: heartbeat details are optional in the status output */
      }
    } else {
      console.log(" Sentinel is not running.");
      removePidFile(SENTINEL_PID_FILE);
    }
    process.exit(0);
  }

  // Default: start sentinel
  startSentinel().catch((err) => {
    // A rejection value is not guaranteed to be an Error instance.
    console.error(`${TAG} Fatal: ${err?.message ?? err}`);
    process.exit(1);
  });
}