@virtengine/openfleet 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +914 -0
- package/LICENSE +190 -0
- package/README.md +500 -0
- package/agent-endpoint.mjs +918 -0
- package/agent-hook-bridge.mjs +230 -0
- package/agent-hooks.mjs +1188 -0
- package/agent-pool.mjs +2403 -0
- package/agent-prompts.mjs +689 -0
- package/agent-sdk.mjs +141 -0
- package/anomaly-detector.mjs +1195 -0
- package/autofix.mjs +1294 -0
- package/claude-shell.mjs +708 -0
- package/cli.mjs +906 -0
- package/codex-config.mjs +1274 -0
- package/codex-model-profiles.mjs +135 -0
- package/codex-shell.mjs +762 -0
- package/config-doctor.mjs +613 -0
- package/config.mjs +1720 -0
- package/conflict-resolver.mjs +248 -0
- package/container-runner.mjs +450 -0
- package/copilot-shell.mjs +827 -0
- package/daemon-restart-policy.mjs +56 -0
- package/diff-stats.mjs +282 -0
- package/error-detector.mjs +829 -0
- package/fetch-runtime.mjs +34 -0
- package/fleet-coordinator.mjs +838 -0
- package/get-telegram-chat-id.mjs +71 -0
- package/git-safety.mjs +170 -0
- package/github-reconciler.mjs +403 -0
- package/hook-profiles.mjs +651 -0
- package/kanban-adapter.mjs +4491 -0
- package/lib/logger.mjs +645 -0
- package/maintenance.mjs +828 -0
- package/merge-strategy.mjs +1171 -0
- package/monitor.mjs +12207 -0
- package/openfleet.config.example.json +115 -0
- package/openfleet.schema.json +465 -0
- package/package.json +203 -0
- package/postinstall.mjs +187 -0
- package/pr-cleanup-daemon.mjs +978 -0
- package/preflight.mjs +408 -0
- package/prepublish-check.mjs +90 -0
- package/presence.mjs +328 -0
- package/primary-agent.mjs +282 -0
- package/publish.mjs +151 -0
- package/repo-root.mjs +29 -0
- package/restart-controller.mjs +100 -0
- package/review-agent.mjs +557 -0
- package/rotate-agent-logs.sh +133 -0
- package/sdk-conflict-resolver.mjs +973 -0
- package/session-tracker.mjs +880 -0
- package/setup.mjs +3937 -0
- package/shared-knowledge.mjs +410 -0
- package/shared-state-manager.mjs +841 -0
- package/shared-workspace-cli.mjs +199 -0
- package/shared-workspace-registry.mjs +537 -0
- package/shared-workspaces.json +18 -0
- package/startup-service.mjs +1070 -0
- package/sync-engine.mjs +1063 -0
- package/task-archiver.mjs +801 -0
- package/task-assessment.mjs +550 -0
- package/task-claims.mjs +924 -0
- package/task-complexity.mjs +581 -0
- package/task-executor.mjs +5111 -0
- package/task-store.mjs +753 -0
- package/telegram-bot.mjs +9281 -0
- package/telegram-sentinel.mjs +2010 -0
- package/ui/app.js +867 -0
- package/ui/app.legacy.js +1464 -0
- package/ui/app.monolith.js +2488 -0
- package/ui/components/charts.js +226 -0
- package/ui/components/chat-view.js +567 -0
- package/ui/components/command-palette.js +587 -0
- package/ui/components/diff-viewer.js +190 -0
- package/ui/components/forms.js +327 -0
- package/ui/components/kanban-board.js +451 -0
- package/ui/components/session-list.js +305 -0
- package/ui/components/shared.js +473 -0
- package/ui/index.html +70 -0
- package/ui/modules/api.js +297 -0
- package/ui/modules/icons.js +461 -0
- package/ui/modules/router.js +81 -0
- package/ui/modules/settings-schema.js +261 -0
- package/ui/modules/state.js +679 -0
- package/ui/modules/telegram.js +331 -0
- package/ui/modules/utils.js +270 -0
- package/ui/styles/animations.css +140 -0
- package/ui/styles/base.css +98 -0
- package/ui/styles/components.css +1915 -0
- package/ui/styles/kanban.css +286 -0
- package/ui/styles/layout.css +809 -0
- package/ui/styles/sessions.css +827 -0
- package/ui/styles/variables.css +188 -0
- package/ui/styles.css +141 -0
- package/ui/styles.monolith.css +1046 -0
- package/ui/tabs/agents.js +1417 -0
- package/ui/tabs/chat.js +74 -0
- package/ui/tabs/control.js +887 -0
- package/ui/tabs/dashboard.js +515 -0
- package/ui/tabs/infra.js +537 -0
- package/ui/tabs/logs.js +783 -0
- package/ui/tabs/settings.js +1487 -0
- package/ui/tabs/tasks.js +1385 -0
- package/ui-server.mjs +4073 -0
- package/update-check.mjs +465 -0
- package/utils.mjs +172 -0
- package/ve-kanban.mjs +654 -0
- package/ve-kanban.ps1 +1365 -0
- package/ve-kanban.sh +18 -0
- package/ve-orchestrator.mjs +340 -0
- package/ve-orchestrator.ps1 +6546 -0
- package/ve-orchestrator.sh +18 -0
- package/vibe-kanban-wrapper.mjs +41 -0
- package/vk-error-resolver.mjs +470 -0
- package/vk-log-stream.mjs +914 -0
- package/whatsapp-channel.mjs +520 -0
- package/workspace-monitor.mjs +581 -0
- package/workspace-reaper.mjs +405 -0
- package/workspace-registry.mjs +238 -0
- package/worktree-manager.mjs +1266 -0
|
@@ -0,0 +1,2010 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* telegram-sentinel.mjs — Always-on Telegram command listener for openfleet.
|
|
5
|
+
*
|
|
6
|
+
* Runs independently of the main openfleet process, ensuring Telegram
|
|
7
|
+
* commands are always handled even when openfleet is down.
|
|
8
|
+
*
|
|
9
|
+
* Architecture:
|
|
10
|
+
* ┌─────────────────┐
|
|
11
|
+
* │ telegram-sentinel│──── always running ────────────────────────────────┐
|
|
12
|
+
* │ (this file) │ │
|
|
13
|
+
* └────────┬─────────┘ │
|
|
14
|
+
* │ │
|
|
15
|
+
* ├─ Standalone Mode (openfleet DOWN) │
|
|
16
|
+
* │ ├─ Polls Telegram directly │
|
|
17
|
+
* │ ├─ Handles simple commands (/ping, /status, /sentinel) │
|
|
18
|
+
* │ └─ Auto-starts openfleet for complex commands │
|
|
19
|
+
* │ │
|
|
20
|
+
* └─ Companion Mode (openfleet UP) │
|
|
21
|
+
* ├─ Does NOT poll (lets telegram-bot.mjs handle it) │
|
|
22
|
+
* ├─ Monitors openfleet health via PID file │
|
|
23
|
+
* └─ Transitions to Standalone if openfleet dies │
|
|
24
|
+
* │
|
|
25
|
+
* ┌─────────────────┐ │
|
|
26
|
+
* │ openfleet │ ← started/stopped by sentinel as needed ─────────┘
|
|
27
|
+
* │ (cli.mjs fork) │
|
|
28
|
+
* └─────────────────┘
|
|
29
|
+
*
|
|
30
|
+
* Usage:
|
|
31
|
+
* node telegram-sentinel.mjs # start sentinel
|
|
32
|
+
* node telegram-sentinel.mjs --stop # stop sentinel
|
|
33
|
+
* node telegram-sentinel.mjs --status # check sentinel status
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
import {
|
|
37
|
+
existsSync,
|
|
38
|
+
readFileSync,
|
|
39
|
+
mkdirSync,
|
|
40
|
+
unlinkSync,
|
|
41
|
+
writeFileSync,
|
|
42
|
+
} from "node:fs";
|
|
43
|
+
import { readFile, writeFile, unlink } from "node:fs/promises";
|
|
44
|
+
import { resolve, dirname } from "node:path";
|
|
45
|
+
import { fileURLToPath } from "node:url";
|
|
46
|
+
import { spawn } from "node:child_process";
|
|
47
|
+
import os from "node:os";
|
|
48
|
+
import {
|
|
49
|
+
execPrimaryPrompt,
|
|
50
|
+
getPrimaryAgentInfo,
|
|
51
|
+
initPrimaryAgent,
|
|
52
|
+
} from "./primary-agent.mjs";
|
|
53
|
+
import { resolveRepoRoot } from "./repo-root.mjs";
|
|
54
|
+
|
|
55
|
+
// ── Paths ────────────────────────────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const repoRoot = resolveRepoRoot();
// Sentinel state files are kept under <repoRoot>/.cache.
const cacheDir = resolve(repoRoot, ".cache");

// NOTE(review): this is the only path resolved next to this module instead of
// under cacheDir — confirm that is where the monitor writes its PID file.
const MONITOR_PID_FILE = resolve(__dirname, ".cache", "openfleet.pid");

// Files owned by the sentinel itself.
const SENTINEL_PID_FILE = resolve(cacheDir, "telegram-sentinel.pid");
const SENTINEL_HEARTBEAT_FILE = resolve(cacheDir, "sentinel-heartbeat.json");
const SENTINEL_LOCK_FILE = resolve(cacheDir, "telegram-sentinel.lock");
const SENTINEL_COMMAND_QUEUE_FILE = resolve(cacheDir, "sentinel-command-queue.json");
const SENTINEL_MONITOR_RECOVERY_FILE = resolve(cacheDir, "sentinel-monitor-recovery.json");

// Other files under cacheDir that the sentinel refers to.
const MONITOR_POLL_LOCK_FILE = resolve(cacheDir, "telegram-getupdates.lock");
const STATUS_FILE = resolve(cacheDir, "ve-orchestrator-status.json");

const TAG = "[sentinel]";
const POLL_TIMEOUT_S = 30;
const MAX_MESSAGE_LEN = 4000;
const HEALTH_CHECK_INTERVAL_MS = 30_000;
const POLL_ERROR_BACKOFF_BASE_MS = 5_000;
const POLL_ERROR_BACKOFF_MAX_MS = 120_000;
const COMMAND_QUEUE_MAX_SIZE = 50;
const COMMAND_QUEUE_TTL_MS = 10 * 60_000; // 10 minutes
const MONITOR_START_TIMEOUT_MS = 60_000; // 60s to wait for monitor to become healthy
const MONITOR_HEALTH_POLL_MS = 2_000; // check every 2s during startup
|
|
87
|
+
|
|
88
|
+
// ── State ────────────────────────────────────────────────────────────────────
|
|
89
|
+
|
|
90
|
+
/** @type {"standalone" | "companion"} */
let mode = "standalone";
let running = false;
let polling = false;
/** @type {AbortController | null} */
let pollAbort = null;
let lastUpdateId = 0;
let healthCheckTimer = null;
let heartbeatTimer = null;
let consecutivePollErrors = 0;
let commandsProcessed = 0;
let startedAt = new Date().toISOString();
/** @type {Array<{ chatId: string|number, text: string, timestamp: number }>} */
let commandQueue = [];
/** @type {Promise<void> | null} */
let monitorStartPromise = null;
let sentinelPollLockHeld = false;

// Recovery bookkeeping. Everything below except recoveryInProgress is
// persisted by saveRecoveryState() and restored by loadRecoveryState().
let recoveryInProgress = false;
let monitorRestartAttempts = [];
let monitorCrashEvents = [];
let lastRepairAt = 0;
let lastMonitorStartAt = 0;
let monitorManualStopUntil = 0;

// Built-in defaults; initEnv() overwrites every field from the SENTINEL_* settings.
const sentinelConfig = {
  autoRestartMonitor: true,
  crashLoopThreshold: 3,
  crashLoopWindowMs: 10 * 60_000,
  monitorStartGraceMs: 45_000,
  repairAgentEnabled: true,
  repairCooldownMs: 15 * 60_000,
  repairTimeoutMs: 20 * 60_000,
  primaryAgentFallbackEnabled: true,
  primaryAgentFallbackTimeoutMs: 15 * 60_000,
  restartBackoffMs: 5_000,
  manualStopHoldMs: 10 * 60_000,
  monitorMonitorCheckEnabled: true,
  monitorMonitorMaxAgeMs: 20 * 60_000,
};

// ── Environment ──────────────────────────────────────────────────────────────

// Populated by initEnv().
/** @type {string} */
let telegramToken = "";
/** @type {string} */
let telegramChatId = "";
/** @type {string} */
let projectName = "";
|
|
138
|
+
|
|
139
|
+
/**
 * Read KEY=VALUE pairs from the `.env` file located next to this module.
 * Blank lines, lines starting with `#`, and lines without `=` are skipped.
 * One pair of matching surrounding quotes (single or double) is stripped
 * from each value. A missing or unreadable file yields an empty object.
 * Uses a simple line-by-line parser — no external dependencies.
 * @returns {{ TELEGRAM_BOT_TOKEN?: string, TELEGRAM_CHAT_ID?: string, PROJECT_NAME?: string }}
 */
function loadEnvCredentials() {
  /** @type {Record<string, string>} */
  const vars = {};
  const envPath = resolve(__dirname, ".env");
  if (!existsSync(envPath)) return vars;

  let text;
  try {
    text = readFileSync(envPath, "utf8");
  } catch {
    // best effort: an unreadable file behaves like an empty one
    return vars;
  }

  for (const rawLine of text.split("\n")) {
    const entry = rawLine.trim();
    if (entry === "" || entry.startsWith("#")) continue;

    const separatorAt = entry.indexOf("=");
    if (separatorAt === -1) continue;

    const name = entry.slice(0, separatorAt).trim();
    const rawValue = entry.slice(separatorAt + 1).trim();

    // Strip surrounding quotes when the value opens and closes with the same one.
    const opener = rawValue[0];
    const isQuoted =
      (opener === '"' || opener === "'") && rawValue.endsWith(opener);
    vars[name] = isQuoted ? rawValue.slice(1, -1) : rawValue;
  }

  return vars;
}
|
|
175
|
+
|
|
176
|
+
/**
 * Interpret a loosely formatted on/off setting.
 * Accepts "1"/"true"/"yes"/"on" and "0"/"false"/"no"/"off", ignoring case
 * and surrounding whitespace. Anything else — including null, undefined and
 * the empty string — yields `defaultValue`.
 * @param {unknown} value
 * @param {boolean} defaultValue
 * @returns {boolean}
 */
function parseBool(value, defaultValue) {
  if (value == null || value === "") return defaultValue;

  switch (String(value).trim().toLowerCase()) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    case "0":
    case "false":
    case "no":
    case "off":
      return false;
    default:
      return defaultValue;
  }
}
|
|
187
|
+
|
|
188
|
+
/**
 * Parse a numeric setting and clamp it to an optional range.
 *
 * Missing input (null, undefined, or a blank/whitespace-only string) and
 * anything that does not convert to a finite number yields `defaultValue`.
 * The blank check is needed because `Number("")` and `Number(null)` are both
 * 0, which would otherwise be mistaken for an explicit zero.
 *
 * @param {unknown} value - Raw setting, typically a string.
 * @param {number} defaultValue - Returned when `value` is missing or invalid.
 * @param {number | null} [min] - Lower bound; ignored unless finite.
 * @param {number | null} [max] - Upper bound; ignored unless finite.
 * @returns {number}
 */
function parseNumber(value, defaultValue, min = null, max = null) {
  if (value == null) return defaultValue;
  if (typeof value === "string" && value.trim() === "") return defaultValue;

  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return defaultValue;

  let out = parsed;
  if (Number.isFinite(min)) out = Math.max(min, out);
  if (Number.isFinite(max)) out = Math.min(max, out);
  return out;
}
|
|
196
|
+
|
|
197
|
+
/**
 * Look up a setting, preferring the parsed `.env` values over `process.env`.
 * A source is used only when its value is neither nullish nor blank after
 * trimming; the value itself is returned untrimmed.
 * @param {Record<string, string> | null | undefined} fileVars - Values parsed from `.env`.
 * @param {string} key
 * @param {string} [fallback] - Returned when neither source has a usable value.
 * @returns {string}
 */
function getEnvValue(fileVars, key, fallback = "") {
  const isUsable = (candidate) =>
    candidate != null && String(candidate).trim() !== "";

  // .env is the PRIMARY source, then process env.
  for (const candidate of [fileVars?.[key], process.env[key]]) {
    if (isUsable(candidate)) return candidate;
  }
  return fallback;
}
|
|
207
|
+
|
|
208
|
+
/**
 * Load Telegram credentials, the project name and every `sentinelConfig`
 * field from the `.env` file and `process.env`.
 *
 * Lookups go through getEnvValue(), so a non-blank `.env` value wins over
 * `process.env`. Numeric settings are clamped to the ranges given below and
 * converted from their configured unit (minutes or seconds) to milliseconds.
 */
function initEnv() {
  const fileVars = loadEnvCredentials();

  const SECOND_MS = 1000;
  const MINUTE_MS = 60_000;

  // Boolean setting; every flag defaults to enabled.
  const readFlag = (key) => parseBool(getEnvValue(fileVars, key, "1"), true);
  // Numeric setting clamped to [min, max].
  const readNumber = (key, fallback, min, max) =>
    parseNumber(getEnvValue(fileVars, key, String(fallback)), fallback, min, max);
  // Numeric setting clamped to [min, max], then scaled to milliseconds.
  const readDuration = (key, fallback, min, max, unitMs) =>
    readNumber(key, fallback, min, max) * unitMs;

  telegramToken = getEnvValue(fileVars, "TELEGRAM_BOT_TOKEN", "");
  telegramChatId = getEnvValue(fileVars, "TELEGRAM_CHAT_ID", "");
  projectName = getEnvValue(fileVars, "PROJECT_NAME", "openfleet");

  sentinelConfig.autoRestartMonitor = readFlag("SENTINEL_AUTO_RESTART_MONITOR");
  sentinelConfig.crashLoopThreshold = readNumber(
    "SENTINEL_CRASH_LOOP_THRESHOLD",
    3,
    2,
    20,
  );
  sentinelConfig.crashLoopWindowMs = readDuration(
    "SENTINEL_CRASH_LOOP_WINDOW_MIN",
    10,
    1,
    120,
    MINUTE_MS,
  );
  sentinelConfig.monitorStartGraceMs = readDuration(
    "SENTINEL_MONITOR_START_GRACE_SEC",
    45,
    10,
    600,
    SECOND_MS,
  );
  sentinelConfig.repairAgentEnabled = readFlag("SENTINEL_REPAIR_AGENT_ENABLED");
  sentinelConfig.repairCooldownMs = readDuration(
    "SENTINEL_REPAIR_COOLDOWN_MIN",
    15,
    1,
    240,
    MINUTE_MS,
  );
  sentinelConfig.repairTimeoutMs = readDuration(
    "SENTINEL_REPAIR_TIMEOUT_MIN",
    20,
    1,
    240,
    MINUTE_MS,
  );
  sentinelConfig.primaryAgentFallbackEnabled = readFlag(
    "SENTINEL_PRIMARY_AGENT_FALLBACK_ENABLED",
  );
  sentinelConfig.primaryAgentFallbackTimeoutMs = readDuration(
    "SENTINEL_PRIMARY_AGENT_TIMEOUT_MIN",
    15,
    1,
    180,
    MINUTE_MS,
  );
  sentinelConfig.restartBackoffMs = readDuration(
    "SENTINEL_RESTART_BACKOFF_SEC",
    5,
    0,
    600,
    SECOND_MS,
  );
  sentinelConfig.manualStopHoldMs = readDuration(
    "SENTINEL_MANUAL_STOP_HOLD_MIN",
    10,
    0,
    240,
    MINUTE_MS,
  );
  sentinelConfig.monitorMonitorCheckEnabled = readFlag(
    "SENTINEL_MONITOR_MONITOR_CHECK_ENABLED",
  );
  sentinelConfig.monitorMonitorMaxAgeMs = readDuration(
    "SENTINEL_MONITOR_MONITOR_MAX_AGE_MIN",
    20,
    1,
    240,
    MINUTE_MS,
  );
}
|
|
293
|
+
|
|
294
|
+
/**
 * Keep only the timestamps that fall inside the crash-loop window ending at
 * `now`. Entries that are not finite numbers are dropped.
 * @param {number[] | null | undefined} values - Epoch-millisecond timestamps.
 * @param {number} [now] - End of the window; defaults to the current time.
 * @returns {number[]} A new array; the input is not modified.
 */
function pruneTimestamps(values, now = Date.now()) {
  const cutoff = now - sentinelConfig.crashLoopWindowMs;
  const isInsideWindow = (stamp) => Number.isFinite(stamp) && stamp >= cutoff;
  const source = values || [];
  return source.filter(isInsideWindow);
}
|
|
298
|
+
|
|
299
|
+
/**
 * Persist the monitor-recovery bookkeeping to SENTINEL_MONITOR_RECOVERY_FILE
 * as pretty-printed JSON, creating the parent directory if needed.
 * Best effort: any filesystem error is ignored.
 */
function saveRecoveryState() {
  const snapshot = {
    monitorRestartAttempts,
    monitorCrashEvents,
    lastRepairAt,
    lastMonitorStartAt,
    monitorManualStopUntil,
    updatedAt: new Date().toISOString(),
  };

  try {
    const targetDir = dirname(SENTINEL_MONITOR_RECOVERY_FILE);
    mkdirSync(targetDir, { recursive: true });
    const serialized = JSON.stringify(snapshot, null, 2);
    writeFileSync(SENTINEL_MONITOR_RECOVERY_FILE, serialized, "utf8");
  } catch {
    /* best effort */
  }
}
|
|
322
|
+
|
|
323
|
+
/**
 * Restore the monitor-recovery bookkeeping written by saveRecoveryState().
 * Fields of the wrong type fall back to an empty array or 0. A missing,
 * empty or unparsable file leaves the current in-memory state as it is.
 */
function loadRecoveryState() {
  const toArray = (candidate) => (Array.isArray(candidate) ? candidate : []);
  const toNumber = (candidate) => Number(candidate) || 0;

  try {
    if (!existsSync(SENTINEL_MONITOR_RECOVERY_FILE)) return;

    const text = readFileSync(SENTINEL_MONITOR_RECOVERY_FILE, "utf8");
    if (!text || !text.trim()) return;

    const saved = JSON.parse(text);
    monitorRestartAttempts = toArray(saved.monitorRestartAttempts);
    monitorCrashEvents = toArray(saved.monitorCrashEvents);
    lastRepairAt = toNumber(saved.lastRepairAt);
    lastMonitorStartAt = toNumber(saved.lastMonitorStartAt);
    monitorManualStopUntil = toNumber(saved.monitorManualStopUntil);
  } catch {
    /* best effort */
  }
}
|
|
342
|
+
|
|
343
|
+
/**
 * Add the current time to the monitor restart-attempt history, drop entries
 * that have aged out of the crash-loop window, and persist the result.
 */
function recordMonitorRestartAttempt() {
  const stamp = Date.now();
  monitorRestartAttempts = pruneTimestamps(
    [...monitorRestartAttempts, stamp],
    stamp,
  );
  saveRecoveryState();
}
|
|
349
|
+
|
|
350
|
+
/**
 * Add the current time to the monitor crash history, drop entries that have
 * aged out of the crash-loop window, and persist the result.
 */
function recordMonitorCrashEvent() {
  const stamp = Date.now();
  monitorCrashEvents = pruneTimestamps([...monitorCrashEvents, stamp], stamp);
  saveRecoveryState();
}
|
|
356
|
+
|
|
357
|
+
/**
 * Report whether the monitor is crash-looping: after pruning both histories
 * to the crash-loop window, either the crash events or the restart attempts
 * have reached `sentinelConfig.crashLoopThreshold`.
 * @param {number} [now] - End of the window; defaults to the current time.
 * @returns {boolean}
 */
function isCrashLoopDetected(now = Date.now()) {
  monitorRestartAttempts = pruneTimestamps(monitorRestartAttempts, now);
  monitorCrashEvents = pruneTimestamps(monitorCrashEvents, now);

  const { crashLoopThreshold } = sentinelConfig;
  return [monitorCrashEvents, monitorRestartAttempts].some(
    (history) => history.length >= crashLoopThreshold,
  );
}
|
|
366
|
+
|
|
367
|
+
/**
 * Judge the health of the "monitor-monitor" from the orchestrator status file.
 *
 * Reports healthy when the check is disabled (sentinel config, the
 * DEVMODE_MONITOR_MONITOR_ENABLED environment variable, or `enabled: false`
 * in the status section) or when the section's last-run timestamp is no older
 * than `sentinelConfig.monitorMonitorMaxAgeMs`. Every other outcome — missing
 * file or section, missing/invalid/future timestamp, stale run, read or parse
 * error — is reported as unhealthy with a reason.
 * @returns {Promise<{ ok: boolean, reason: string }>}
 */
async function assessMonitorMonitorHealth() {
  const verdict = (ok, reason) => ({ ok, reason });

  if (!sentinelConfig.monitorMonitorCheckEnabled) {
    return verdict(true, "check disabled");
  }

  const devmodeSetting = process.env.DEVMODE_MONITOR_MONITOR_ENABLED ?? "1";
  if (!parseBool(devmodeSetting, true)) {
    return verdict(true, "devmode monitor-monitor disabled");
  }

  try {
    if (!existsSync(STATUS_FILE)) {
      return verdict(false, "status file missing");
    }

    const statusText = await readFile(STATUS_FILE, "utf8");
    const status = JSON.parse(statusText || "{}");

    // The section may be keyed in snake_case or camelCase.
    const section = status?.monitor_monitor || status?.monitorMonitor || null;
    if (!section || typeof section !== "object") {
      return verdict(false, "monitor-monitor section unavailable");
    }
    if (section.enabled === false) {
      return verdict(true, "monitor-monitor disabled in status");
    }

    const lastRunAt =
      section.lastRunAt || section.last_run_at || section.last_run || null;
    if (!lastRunAt) {
      return verdict(false, "monitor-monitor missing last run timestamp");
    }

    // NaN (unparsable date) and negative ages (timestamp in the future) are both rejected.
    const ageMs = Date.now() - new Date(lastRunAt).getTime();
    if (!Number.isFinite(ageMs) || ageMs < 0) {
      return verdict(false, "monitor-monitor timestamp invalid");
    }

    const ageLabel = formatUptime(ageMs);
    return ageMs > sentinelConfig.monitorMonitorMaxAgeMs
      ? verdict(false, `monitor-monitor stale (${ageLabel} old)`)
      : verdict(true, `healthy (${ageLabel} old)`);
  } catch (err) {
    return verdict(false, err?.message || "health check failed");
  }
}
|
|
410
|
+
|
|
411
|
+
/**
 * Reduce an agent result to display text.
 *
 * Falsy results become "(no response)"; strings are returned unchanged.
 * For objects, the trimmed `finalResponse` is used when it is a non-blank
 * string, otherwise the trimmed `response`. Anything else is JSON-serialized
 * and cut to 3000 characters, falling back to `String(result)` when
 * serialization is not possible.
 * @param {unknown} result
 * @returns {string}
 */
function normalizeAgentResult(result) {
  if (!result) return "(no response)";
  if (typeof result === "string") return result;

  for (const field of ["finalResponse", "response"]) {
    const text = result[field];
    if (typeof text === "string" && text.trim()) {
      return text.trim();
    }
  }

  try {
    // JSON.stringify may return undefined (e.g. for functions); the resulting
    // TypeError from .slice is handled by the catch below.
    return JSON.stringify(result).slice(0, 3000);
  } catch {
    return String(result);
  }
}
|
|
426
|
+
|
|
427
|
+
/**
 * Ask the primary agent to diagnose and fix a monitor crash loop, reporting
 * progress and the outcome to the configured Telegram chat.
 *
 * Does nothing (returns false) when the repair agent is disabled, a repair is
 * already running, or the previous repair started less than
 * `sentinelConfig.repairCooldownMs` ago. The cooldown clock starts when the
 * repair is launched, not when it finishes.
 * @param {string} triggerReason - Why the repair was requested.
 * @param {string} [details] - Optional extra context included in the prompt.
 * @returns {Promise<boolean>} true when the agent ran to completion and its
 *   summary was sent; false when skipped or when any step failed.
 */
async function runRepairAgent(triggerReason, details = "") {
  if (!sentinelConfig.repairAgentEnabled || recoveryInProgress) return false;

  const launchedAt = Date.now();
  if (lastRepairAt > 0) {
    const remainingMs =
      sentinelConfig.repairCooldownMs - (launchedAt - lastRepairAt);
    if (remainingMs > 0) {
      log(
        "warn",
        `repair-agent cooldown active (${Math.round(remainingMs / 1000)}s remaining)`,
      );
      return false;
    }
  }

  recoveryInProgress = true;
  lastRepairAt = launchedAt;
  saveRecoveryState();

  try {
    const announcement = [
      "🧰 Crash-loop detected. Launching repair agent.",
      `Trigger: ${triggerReason}`,
    ];
    if (details) announcement.push(`Context: ${details}`);
    await sendTelegram(telegramChatId, announcement.join("\n"));

    await initPrimaryAgent();
    const agentInfo = getPrimaryAgentInfo();
    const mmHealth = await assessMonitorMonitorHealth();
    const mmLabel = mmHealth.ok ? "healthy" : "degraded";

    const promptLines = [
      "openfleet sentinel autonomous repair request.",
      `Trigger: ${triggerReason}`,
      `Project: ${projectName}`,
      `Host: ${os.hostname()}`,
      `Crash events in window: ${monitorCrashEvents.length}`,
      `Restart attempts in window: ${monitorRestartAttempts.length}`,
      `Monitor-monitor health: ${mmLabel} (${mmHealth.reason})`,
    ];
    if (details) promptLines.push(`Additional context: ${details}`);
    promptLines.push(
      "Task:",
      "1) Diagnose likely monitor crash-loop root cause.",
      "2) Apply safe, minimal fixes directly in this workspace when possible.",
      "3) Return concise summary: root cause, files changed, validation performed, residual risk.",
    );

    const result = await execPrimaryPrompt(promptLines.join("\n"), {
      timeoutMs: sentinelConfig.repairTimeoutMs,
    });

    const summary = normalizeAgentResult(result);
    const header = `✅ Repair agent completed via ${agentInfo.adapter}.`;
    await sendTelegram(
      telegramChatId,
      `${header}\n\n${summary.slice(0, 3500)}`,
    );
    return true;
  } catch (err) {
    await sendTelegram(
      telegramChatId,
      `❌ Repair agent failed: ${err?.message || err}`,
    );
    return false;
  } finally {
    // Always clear the in-progress flag and persist, whatever the outcome.
    recoveryInProgress = false;
    saveRecoveryState();
  }
}
|
|
503
|
+
|
|
504
|
+
/**
 * Handle a Telegram request through the primary agent while openfleet is
 * offline, replying to the originating chat with the agent's answer.
 * @param {string | number} chatId - Chat to reply to.
 * @param {string} text - The user's full input, passed to the agent verbatim.
 * @param {string} command - The command being handled, named in the prompt.
 * @returns {Promise<boolean>} true when a reply was produced; false when the
 *   fallback is disabled or any step failed (the failure is reported to the chat).
 */
async function runPrimaryAgentFallback(chatId, text, command) {
  if (!sentinelConfig.primaryAgentFallbackEnabled) return false;

  try {
    await initPrimaryAgent();
    const { adapter } = getPrimaryAgentInfo();
    await sendTelegram(
      chatId,
      `🤖 openfleet is down. Running via sentinel fallback (${adapter})...`,
    );

    const promptLines = [
      "Telegram fallback request while openfleet is offline.",
      "",
      `Project: ${projectName}`,
      `Host: ${os.hostname()}`,
      `Command: ${command}`,
      "",
      "User input:",
      text,
      "",
      "Execute this request directly and return a concise, actionable response suitable for Telegram.",
      "If the exact command requires monitor internals, provide the closest equivalent action and clear next steps.",
    ];
    const result = await execPrimaryPrompt(promptLines.join("\n"), {
      timeoutMs: sentinelConfig.primaryAgentFallbackTimeoutMs,
    });

    const reply = normalizeAgentResult(result).slice(0, 3600);
    await sendTelegram(
      chatId,
      reply || "(fallback completed with no text output)",
    );
    return true;
  } catch (err) {
    await sendTelegram(
      chatId,
      `❌ Sentinel fallback failed: ${err?.message || err}`,
    );
    return false;
  }
}
|
|
544
|
+
|
|
545
|
+
/**
 * Try to bring the openfleet monitor back up after it was found down.
 *
 * Skipped when auto-restart is disabled, a monitor start is already pending,
 * or a manual-stop hold is still in effect. When a crash loop is detected the
 * repair agent is given a chance first; the restart is attempted afterwards
 * regardless of the repair outcome. The result is reported to Telegram.
 * @param {string} triggerReason - Why recovery was requested.
 * @returns {Promise<void>}
 */
async function attemptMonitorRecovery(triggerReason) {
  if (!sentinelConfig.autoRestartMonitor || monitorStartPromise) return;

  if (Date.now() < monitorManualStopUntil) {
    log("info", "auto-restart suppressed due to recent manual stop");
    return;
  }

  if (isCrashLoopDetected()) {
    const mmHealth = await assessMonitorMonitorHealth();
    const windowMinutes = Math.round(sentinelConfig.crashLoopWindowMs / 60000);
    const mmLabel = mmHealth.ok ? "healthy" : "degraded";
    const notice = [
      "⚠️ Monitor crash-loop detected.",
      `Window: ${windowMinutes}m | threshold: ${sentinelConfig.crashLoopThreshold}`,
      `Monitor-monitor: ${mmLabel} (${mmHealth.reason})`,
      "Attempting autonomous repair before restart.",
    ];
    await sendTelegram(telegramChatId, notice.join("\n"));
    await runRepairAgent(triggerReason, mmHealth.reason);
  }

  const backoffMs = sentinelConfig.restartBackoffMs;
  if (backoffMs > 0) {
    await sleep(backoffMs);
  }

  let outcome;
  try {
    await ensureMonitorRunning(`sentinel recovery: ${triggerReason}`);
    const pid = readAlivePid(MONITOR_PID_FILE);
    outcome = `✅ openfleet recovered${pid ? ` (PID ${pid})` : ""}.`;
  } catch (err) {
    outcome = `❌ openfleet auto-restart failed: ${err?.message || err}`;
  }
  await sendTelegram(telegramChatId, outcome);
}
|
|
587
|
+
|
|
588
|
+
// ── Process Utilities ────────────────────────────────────────────────────────
|
|
589
|
+
|
|
590
|
+
/**
 * Check if a process with the given PID is alive.
 *
 * Sends signal 0, which performs the existence and permission checks without
 * delivering a signal. An EPERM failure means the process exists but belongs
 * to another user, so it is reported as alive; any other failure (typically
 * ESRCH) means there is no such process.
 * @param {number} pid
 * @returns {boolean}
 */
function isProcessAlive(pid) {
  if (!Number.isFinite(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return err?.code === "EPERM";
  }
}
|
|
604
|
+
|
|
605
|
+
/**
 * Read a PID from a file and check if the process is alive.
 * A missing or unreadable file, non-numeric contents, or a PID whose process
 * is not alive all yield null.
 * @param {string} pidPath
 * @returns {number | null} The PID if alive, null otherwise.
 */
function readAlivePid(pidPath) {
  try {
    if (!existsSync(pidPath)) return null;

    const contents = readFileSync(pidPath, "utf8");
    const pid = Number.parseInt(contents.trim(), 10);
    if (Number.isNaN(pid) || !isProcessAlive(pid)) return null;
    return pid;
  } catch {
    return null;
  }
}
|
|
620
|
+
|
|
621
|
+
/**
 * Write a PID file (best effort), creating its parent directory if needed.
 * The file is overwritten in place; any error is ignored.
 * @param {string} pidPath
 * @param {number} pid
 */
function writePidFile(pidPath, pid) {
  try {
    const parentDir = dirname(pidPath);
    mkdirSync(parentDir, { recursive: true });
    const contents = String(pid);
    writeFileSync(pidPath, contents, "utf8");
  } catch {
    /* best effort */
  }
}
|
|
634
|
+
|
|
635
|
+
/**
 * Remove a PID file (best effort). A file that does not exist, or any other
 * failure to delete it, is ignored.
 * @param {string} pidPath
 */
function removePidFile(pidPath) {
  try {
    unlinkSync(pidPath);
  } catch {
    /* best effort — includes the file not existing */
  }
}
|
|
646
|
+
|
|
647
|
+
// ── Sentinel Lock ────────────────────────────────────────────────────────────
|
|
648
|
+
|
|
649
|
+
/**
 * Acquire the sentinel poll lock file. Uses exclusive create (wx) to prevent
 * races between multiple sentinel instances.
 *
 * The lock directory is created on demand so the first run on a fresh
 * checkout can take the lock. An existing lock is reclaimed when it is empty,
 * unparsable, or names a PID that is no longer alive. Reclaiming is retried a
 * bounded number of times so a stale lock that cannot be deleted does not
 * cause an endless retry loop.
 * @returns {Promise<boolean>} true when this process holds the lock; false
 *   when another live process holds it or the lock could not be taken.
 */
async function acquireSentinelPollLock() {
  if (sentinelPollLockHeld) return true;

  const MAX_ATTEMPTS = 3;
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt += 1) {
    try {
      mkdirSync(dirname(SENTINEL_LOCK_FILE), { recursive: true });
      const payload = JSON.stringify(
        {
          owner: "sentinel",
          pid: process.pid,
          started_at: new Date().toISOString(),
        },
        null,
        2,
      );
      await writeFile(SENTINEL_LOCK_FILE, payload, { flag: "wx" });
      sentinelPollLockHeld = true;
      return true;
    } catch (err) {
      // Anything other than "lock already exists" is not recoverable here.
      if (err?.code !== "EEXIST") return false;
    }

    // Check if the existing lock holder is still alive.
    let holderAlive = false;
    try {
      const raw = await readFile(SENTINEL_LOCK_FILE, "utf8");
      if (raw && raw.trim()) {
        holderAlive = isProcessAlive(Number(JSON.parse(raw)?.pid));
      }
    } catch {
      // Unreadable or corrupt lock file — treat as stale.
      holderAlive = false;
    }

    // Another live sentinel holds the lock.
    if (holderAlive) return false;

    // Stale lock — remove it and try the exclusive create again.
    await unlink(SENTINEL_LOCK_FILE).catch(() => {});
  }

  return false;
}
|
|
696
|
+
|
|
697
|
+
/**
 * Release the sentinel poll lock file.
 * Does nothing unless this process currently holds the lock. The held flag is
 * cleared before the file is deleted, and a failed delete is ignored.
 * @returns {Promise<void>}
 */
async function releaseSentinelPollLock() {
  if (!sentinelPollLockHeld) return;

  sentinelPollLockHeld = false;
  try {
    await unlink(SENTINEL_LOCK_FILE);
  } catch {
    /* best effort */
  }
}
|
|
710
|
+
|
|
711
|
+
// ── Telegram API ─────────────────────────────────────────────────────────────
|
|
712
|
+
|
|
713
|
+
/**
 * Send a text message to a Telegram chat.
 *
 * The text is split with splitMessage() and each chunk is posted to the
 * sendMessage endpoint with a 15 second timeout. Failures are logged and the
 * remaining chunks are still attempted; this function does not throw for
 * network or API errors.
 *
 * When a chunk is rejected with HTTP 400 while a parse mode is set, that
 * chunk is re-sent as plain text and the loop then continues with the
 * following chunks, so a formatting error in one chunk does not drop the
 * rest of a long message.
 *
 * @param {string | number} chatId
 * @param {string} text
 * @param {object} [options]
 * @param {string} [options.parseMode] Value for Telegram's parse_mode field.
 * @param {boolean} [options.silent] Send with disable_notification set.
 * @returns {Promise<number | null>} The message_id of the last successfully
 *   sent chunk, or null when nothing was sent (or no token is configured).
 */
async function sendTelegram(chatId, text, options = {}) {
  if (!telegramToken) return null;

  const url = `https://api.telegram.org/bot${telegramToken}/sendMessage`;
  const chunks = splitMessage(text, MAX_MESSAGE_LEN);
  let lastMessageId = null;

  for (const chunk of chunks) {
    /** @type {Record<string, any>} */
    const payload = {
      chat_id: chatId,
      text: chunk,
      disable_web_page_preview: true,
    };
    if (options.parseMode) payload.parse_mode = options.parseMode;
    if (options.silent) payload.disable_notification = true;

    try {
      const res = await fetch(url, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(15_000),
      });

      if (!res || typeof res.ok === "undefined") {
        log("warn", "send error: invalid response object");
        continue;
      }

      if (!res.ok) {
        const body = await res.text().catch(() => "");
        log("warn", `send failed: ${res.status} ${body}`);
        // If parse_mode caused the error, retry this chunk as plain text and
        // keep going with the remaining chunks.
        if (options.parseMode && res.status === 400) {
          const retryId = await sendTelegram(chatId, chunk, {
            ...options,
            parseMode: undefined,
          });
          if (retryId !== null) lastMessageId = retryId;
        }
        continue;
      }

      try {
        const data = await res.json();
        if (data.ok && data.result?.message_id) {
          lastMessageId = data.result.message_id;
        }
      } catch {
        /* best effort — the message was sent, only the id is unknown */
      }
    } catch (err) {
      log("warn", `send error: ${err.message}`);
    }
  }
  return lastMessageId;
}
|
|
778
|
+
|
|
779
|
+
/**
 * Split a text into chunks of at most `maxLen` UTF-16 code units.
 *
 * A chunk is cut at the last newline at or before `maxLen` unless that
 * newline lies in the first 30% of the window (or there is none), in which
 * case the text is cut hard at `maxLen`. The newline itself is kept at the
 * start of the following chunk, so joining the chunks reproduces the input.
 *
 * A hard cut never separates a surrogate pair (when `maxLen` is at least 2),
 * so chunks do not end in half of an emoji or other astral character.
 *
 * @param {string} text
 * @param {number} maxLen Maximum chunk length; must be >= 1 when a split is needed.
 * @returns {string[]} `["(empty)"]` for empty input, otherwise the chunks.
 * @throws {RangeError} When a split is required but `maxLen` is not >= 1
 *   (previously this case never terminated).
 */
function splitMessage(text, maxLen) {
  if (!text) return ["(empty)"];
  if (text.length <= maxLen) return [text];
  if (!(maxLen >= 1)) {
    throw new RangeError(`splitMessage: maxLen must be >= 1 (got ${maxLen})`);
  }

  const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;

  const chunks = [];
  let remaining = text;
  while (remaining.length > 0) {
    if (remaining.length <= maxLen) {
      chunks.push(remaining);
      break;
    }
    let splitIdx = remaining.lastIndexOf("\n", maxLen);
    if (splitIdx < maxLen * 0.3) {
      // No usable newline — cut hard, but keep surrogate pairs together.
      splitIdx = Math.floor(maxLen);
      if (
        splitIdx > 1 &&
        isHighSurrogate(remaining.charCodeAt(splitIdx - 1)) &&
        isLowSurrogate(remaining.charCodeAt(splitIdx))
      ) {
        splitIdx -= 1;
      }
    }
    chunks.push(remaining.slice(0, splitIdx));
    remaining = remaining.slice(splitIdx);
  }
  return chunks;
}
|
|
802
|
+
|
|
803
|
+
// ── Telegram Polling ─────────────────────────────────────────────────────────
|
|
804
|
+
|
|
805
|
+
/**
 * Issue one long-poll getUpdates request to the Telegram Bot API.
 *
 * The request asks for "message" updates after `lastUpdateId` and is wired
 * to a fresh AbortController published in `pollAbort`, so other code can
 * cancel it. An aborted request yields an empty list.
 *
 * @returns {Promise<Array<object>>} The updates, or [] when no token is
 *   configured, the request was aborted, or the API answered with ok=false.
 * @throws {Error} On network failure, an invalid response object, or a
 *   non-2xx HTTP status (a 409 is additionally logged as a poller conflict).
 */
async function pollUpdates() {
  if (!telegramToken) return [];

  const endpoint = `https://api.telegram.org/bot${telegramToken}/getUpdates`;
  const query = new URLSearchParams({
    offset: String(lastUpdateId + 1),
    timeout: String(POLL_TIMEOUT_S),
    allowed_updates: JSON.stringify(["message"]),
  });

  const controller = new AbortController();
  pollAbort = controller;

  let response;
  try {
    response = await fetch(`${endpoint}?${query}`, {
      signal: controller.signal,
    });
  } catch (err) {
    if (err.name === "AbortError") return [];
    throw err;
  } finally {
    pollAbort = null;
  }

  if (!response || typeof response.ok === "undefined") {
    throw new Error("invalid response object from Telegram");
  }

  if (response.ok) {
    const parsed = await response.json();
    return parsed.ok ? parsed.result || [] : [];
  }

  const errorBody = await response.text().catch(() => "");
  // 409 = conflict — another poller is active
  if (response.status === 409) {
    log(
      "warn",
      "Telegram 409 conflict — another poller is active, backing off",
    );
  }
  throw new Error(`getUpdates failed: ${response.status} ${errorBody}`);
}
|
|
851
|
+
|
|
852
|
+
/**
 * Main polling loop. Runs while the sentinel is running, polling is enabled
 * and the mode is "standalone".
 *
 * Failures of pollUpdates() are counted in `consecutivePollErrors` and
 * retried with exponential backoff, capped at POLL_ERROR_BACKOFF_MAX_MS.
 *
 * Failures while handling an individual update are logged and skipped. They
 * are not poll errors, so they neither increase the backoff nor delay the
 * other updates in the batch. `lastUpdateId` is advanced before the handler
 * runs, so a failing update is not fetched again.
 *
 * @returns {Promise<void>}
 */
async function pollLoop() {
  log("info", "polling loop started");

  while (running && polling && mode === "standalone") {
    let updates;
    try {
      updates = await pollUpdates();
      consecutivePollErrors = 0;
    } catch (err) {
      if (!running) break;
      consecutivePollErrors++;
      const backoff = Math.min(
        POLL_ERROR_BACKOFF_BASE_MS * Math.pow(2, consecutivePollErrors - 1),
        POLL_ERROR_BACKOFF_MAX_MS,
      );
      log(
        "warn",
        `poll error (attempt ${consecutivePollErrors}): ${err.message} — retry in ${Math.round(backoff / 1000)}s`,
      );
      await sleep(backoff);
      continue;
    }

    for (const update of updates) {
      try {
        lastUpdateId = Math.max(lastUpdateId, update.update_id);
        await handleUpdate(update);
      } catch (err) {
        // A broken command must not be mistaken for a Telegram outage.
        log("warn", `update handler error: ${err.message}`);
      }
    }
  }

  log("info", "polling loop stopped");
}
|
|
885
|
+
|
|
886
|
+
// ── Update Handler ───────────────────────────────────────────────────────────
|
|
887
|
+
|
|
888
|
+
/** Bot commands the sentinel answers itself, without a running openfleet. */
const STANDALONE_COMMANDS = new Set(
  ["/ping", "/status", "/sentinel", "/start", "/stop", "/help"],
);
|
|
897
|
+
|
|
898
|
+
/** Bot commands that can only be served by a running openfleet monitor. */
const MONITOR_REQUIRED_COMMANDS = new Set(
  [
    // Task control
    "/resumetask", "/resume", "/tasks", "/task",
    // Executor selection
    "/sdk", "/model", "/switch",
    // Workspace and scheduling
    "/worktrees", "/prune", "/batch", "/threads", "/rebalance",
    // Diagnostics and administration
    "/logs", "/errors", "/restart", "/config",
  ],
);
|
|
917
|
+
|
|
918
|
+
/**
 * Process one Telegram update.
 *
 * Updates without a text message are ignored, as are messages from any chat
 * other than the configured one. The first whitespace-delimited token,
 * lower-cased and with any "@botname" suffix removed, selects the handler:
 * standalone commands are answered locally, everything else (monitor
 * commands, unknown commands, free text) goes to handleMonitorCommand().
 *
 * @param {object} update
 */
async function handleUpdate(update) {
  const message = update.message;
  if (!message || !message.text) return;

  // Security: only accept messages from the configured chat
  const chatId = String(message.chat?.id);
  if (chatId !== String(telegramChatId)) {
    log("warn", `ignoring message from unauthorized chat ${chatId}`);
    return;
  }

  const text = message.text.trim();
  const [firstToken] = text.split(/\s+/);
  const command = firstToken.toLowerCase();
  // Strip @botname suffix from commands (e.g. /ping@MyBot → /ping)
  const atIndex = command.indexOf("@");
  const bareCommand = atIndex === -1 ? command : command.slice(0, atIndex);

  commandsProcessed++;

  if (!STANDALONE_COMMANDS.has(bareCommand)) {
    // Either a known monitor command, free-text message, or unknown command
    log("info", `command "${bareCommand}" requires openfleet`);
    await handleMonitorCommand(chatId, text, bareCommand);
    return;
  }

  await handleStandaloneCommand(chatId, bareCommand, text);
}
|
|
951
|
+
|
|
952
|
+
// ── Standalone Command Handlers ──────────────────────────────────────────────
|
|
953
|
+
|
|
954
|
+
/**
 * Dispatch a command the sentinel can answer without openfleet.
 *
 * @param {string} chatId Chat to reply to.
 * @param {string} command Bare command such as "/ping".
 * @param {string} fullText Complete message text (currently unused).
 */
async function handleStandaloneCommand(chatId, command, fullText) {
  const handlers = new Map([
    ["/ping", handlePing],
    ["/status", handleStatus],
    ["/sentinel", handleSentinelInfo],
    ["/start", handleStartMonitor],
    ["/stop", handleStopMonitor],
    ["/help", handleHelp],
  ]);

  const handler = handlers.get(command);
  if (handler) {
    await handler(chatId);
    return;
  }

  await sendTelegram(chatId, `Unknown standalone command: ${command}`);
}
|
|
984
|
+
|
|
985
|
+
/**
 * /ping — Reply with a liveness summary: sentinel uptime, current mode,
 * whether the monitor PID file points at a live process, and the host name.
 * @param {string} chatId
 */
async function handlePing(chatId) {
  const monitorPid = readAlivePid(MONITOR_PID_FILE);
  const monitorLine = monitorPid
    ? `✅ running (PID ${monitorPid})`
    : "❌ not running";
  const elapsed = formatUptime(Date.now() - new Date(startedAt).getTime());

  const report = [];
  report.push("🏓 *Pong!*", "");
  report.push(`Sentinel: ✅ alive (${elapsed})`);
  report.push(`Mode: ${mode}`);
  report.push(`Monitor: ${monitorLine}`);
  report.push(`Host: \`${os.hostname()}\``);

  await sendTelegram(chatId, report.join("\n"), { parseMode: "Markdown" });
}
|
|
1006
|
+
|
|
1007
|
+
/**
 * /status — Report the contents of the cached orchestrator status file.
 *
 * Sends the executor mode, slot count and time since the last executor sync
 * when present, followed by up to ten attempts whose status is "running" or
 * "pending". Any read or parse failure is reported back to the chat.
 * @param {string} chatId
 */
async function handleStatus(chatId) {
  try {
    if (!existsSync(STATUS_FILE)) {
      await sendTelegram(
        chatId,
        "📊 No status file found. openfleet may not have run yet.",
      );
      return;
    }

    const data = JSON.parse(await readFile(STATUS_FILE, "utf8"));
    const lines = ["📊 *Orchestrator Status*", ""];

    if (data.executor_mode) {
      lines.push(`Mode: \`${data.executor_mode}\``);
    }
    if (data.active_slots) {
      lines.push(`Slots: \`${data.active_slots}\``);
    }
    if (data.last_executor_sync) {
      const sinceSyncMs = Date.now() - new Date(data.last_executor_sync).getTime();
      lines.push(`Last sync: ${formatUptime(sinceSyncMs)} ago`);
    }

    // Show active attempts
    const attempts = data.attempts;
    if (attempts && typeof attempts === "object") {
      const activeStates = ["running", "pending"];
      const active = Object.values(attempts).filter((attempt) =>
        activeStates.includes(attempt.status),
      );

      if (active.length === 0) {
        lines.push("", "No active tasks.");
      } else {
        lines.push("", "*Active Tasks:*");
        for (const attempt of active.slice(0, 10)) {
          const title =
            attempt.task_title || attempt.task_id?.slice(0, 8) || "?";
          const executor = attempt.executor || "?";
          lines.push(`• ${title} — ${attempt.status} (${executor})`);
        }
      }
    }

    await sendTelegram(chatId, lines.join("\n"), { parseMode: "Markdown" });
  } catch (err) {
    await sendTelegram(chatId, `❌ Error reading status: ${err.message}`);
  }
}
|
|
1055
|
+
|
|
1056
|
+
/**
 * /sentinel — Send a detailed report about this sentinel process: PID, mode,
 * start time and uptime, monitor PID, command counters, the current poll
 * error streak, and host/platform/Node details.
 * @param {string} chatId
 */
async function handleSentinelInfo(chatId) {
  const info = getSentinelStatus();
  const uptime = formatUptime(Date.now() - new Date(info.startedAt).getTime());
  const monitorPid = info.monitorPid ? `\`${info.monitorPid}\`` : "none";

  const fields = [
    ["PID", `\`${process.pid}\``],
    ["Mode", info.mode],
    ["Started", info.startedAt],
    ["Uptime", uptime],
    ["Monitor PID", monitorPid],
    ["Commands processed", info.commandsProcessed],
    ["Commands queued", info.commandsQueued],
    ["Poll errors", consecutivePollErrors],
    ["Host", `\`${os.hostname()}\``],
    ["Platform", `\`${process.platform} ${process.arch}\``],
    ["Node", `\`${process.version}\``],
  ];

  const text = [
    "🛡️ *Telegram Sentinel*",
    "",
    ...fields.map(([label, value]) => `${label}: ${value}`),
  ].join("\n");

  await sendTelegram(chatId, text, { parseMode: "Markdown" });
}
|
|
1080
|
+
|
|
1081
|
+
/**
 * /start — Start openfleet on request.
 *
 * Replies immediately when the monitor is already alive; otherwise announces
 * the start, waits for ensureMonitorRunning(), and reports success (with the
 * PID when available) or the failure reason.
 * @param {string} chatId
 */
async function handleStartMonitor(chatId) {
  const existingPid = readAlivePid(MONITOR_PID_FILE);
  if (existingPid) {
    await sendTelegram(
      chatId,
      `✅ openfleet is already running (PID ${existingPid}).`,
    );
    return;
  }

  await sendTelegram(chatId, "🚀 Starting openfleet...");

  try {
    await ensureMonitorRunning("manual /start command");
    const startedPid = readAlivePid(MONITOR_PID_FILE);
    const pidSuffix = startedPid ? ` (PID ${startedPid})` : "";
    await sendTelegram(chatId, `✅ openfleet started${pidSuffix}.`);
  } catch (err) {
    await sendTelegram(
      chatId,
      `❌ Failed to start openfleet: ${err.message}`,
    );
  }
}
|
|
1109
|
+
|
|
1110
|
+
/**
 * /stop — Stop openfleet on request.
 *
 * Sends SIGTERM to the monitor, checks every 500 ms (up to 20 times) whether
 * it has exited, and falls back to SIGKILL if it has not. Afterwards the PID
 * file is removed, a manual-stop hold is recorded in the recovery state, and
 * the sentinel switches to standalone mode.
 * @param {string} chatId
 */
async function handleStopMonitor(chatId) {
  const monPid = readAlivePid(MONITOR_PID_FILE);
  if (!monPid) {
    await sendTelegram(chatId, "ℹ️ openfleet is not running.");
    return;
  }

  await sendTelegram(chatId, `🛑 Stopping openfleet (PID ${monPid})...`);

  try {
    process.kill(monPid, "SIGTERM");

    // Wait for process to die
    let checksLeft = 20;
    let exited = false;
    while (!exited && checksLeft > 0) {
      checksLeft -= 1;
      await sleep(500);
      exited = !isProcessAlive(monPid);
    }

    if (!exited) {
      try {
        process.kill(monPid, "SIGKILL");
      } catch {
        /* best effort */
      }
    }

    removePidFile(MONITOR_PID_FILE);
    await sendTelegram(chatId, "✅ openfleet stopped.");

    monitorManualStopUntil = Date.now() + sentinelConfig.manualStopHoldMs;
    saveRecoveryState();

    // Transition to standalone mode after stopping monitor
    await transitionToStandalone("monitor manually stopped");
  } catch (err) {
    await sendTelegram(chatId, `❌ Error stopping monitor: ${err.message}`);
  }
}
|
|
1149
|
+
|
|
1150
|
+
/**
 * /help — List the commands the sentinel handles itself and say what happens
 * to every other command given the monitor's current state.
 * @param {string} chatId
 */
async function handleHelp(chatId) {
  const monitorAlive = Boolean(readAlivePid(MONITOR_PID_FILE));
  const monitorState = monitorAlive ? "running" : "stopped";
  const routing = monitorAlive ? "be forwarded to" : "auto-start";

  const helpText = [
    "🛡️ *Sentinel Commands* (always available)",
    "",
    "/ping — Check sentinel + monitor liveness",
    "/status — Show cached orchestrator status",
    "/sentinel — Show sentinel details",
    "/start — Start openfleet",
    "/stop — Stop openfleet",
    "/help — This message",
    "",
    `Monitor is *${monitorState}*. All other commands will ${routing} openfleet.`,
  ].join("\n");

  await sendTelegram(chatId, helpText, { parseMode: "Markdown" });
}
|
|
1173
|
+
|
|
1174
|
+
// ── Monitor-Required Command Handling ────────────────────────────────────────
|
|
1175
|
+
|
|
1176
|
+
/**
 * Handle a message that the sentinel cannot answer on its own.
 *
 * - Monitor alive: the message is queued and the queue file is written so
 *   the monitor can pick it up.
 * - Monitor down: the primary-agent fallback runs first when enabled. Known
 *   monitor commands are queued. Unless auto-restart is disabled and the
 *   command is not a known monitor command, the monitor is then started and
 *   the queue file written. If the start fails the queue is cleared, and the
 *   user is told unless the fallback already handled the message.
 *
 * @param {string} chatId
 * @param {string} text Full message text.
 * @param {string} command Bare command (first token of the message).
 */
async function handleMonitorCommand(chatId, text, command) {
  const monPid = readAlivePid(MONITOR_PID_FILE);

  if (monPid) {
    // Monitor is running but sentinel is somehow in standalone mode — this
    // can happen briefly during transitions. Queue the command for the
    // monitor to pick up.
    queueCommand(chatId, text);
    await writeCommandQueueFile();
    log("info", "monitor running — queued command for replay");
    return;
  }

  const requiresMonitor = MONITOR_REQUIRED_COMMANDS.has(command);

  const fallbackHandled = sentinelConfig.primaryAgentFallbackEnabled
    ? await runPrimaryAgentFallback(chatId, text, command)
    : false;

  if (requiresMonitor) {
    queueCommand(chatId, text);
  }

  const shouldStartMonitor =
    sentinelConfig.autoRestartMonitor || requiresMonitor;
  if (!shouldStartMonitor) {
    return;
  }

  await sendTelegram(chatId, "⏳ Starting openfleet in the background...");

  try {
    await ensureMonitorRunning(`command: ${command}`);
    if (commandQueue.length > 0) {
      await writeCommandQueueFile();
    }
    log(
      "info",
      `monitor started — ${commandQueue.length} command(s) queued for replay`,
    );
  } catch (err) {
    if (!fallbackHandled) {
      await sendTelegram(
        chatId,
        `❌ Failed to start openfleet: ${err.message}\n\nYour command was not processed.`,
      );
    }
    // Clear the failed commands
    commandQueue = [];
  }
}
|
|
1232
|
+
|
|
1233
|
+
// ── Command Queue ────────────────────────────────────────────────────────────
|
|
1234
|
+
|
|
1235
|
+
/**
 * Append a command to the in-memory replay queue.
 *
 * Entries older than COMMAND_QUEUE_TTL_MS are dropped first; if the queue is
 * still at COMMAND_QUEUE_MAX_SIZE the oldest entry is discarded to make room.
 * @param {string | number} chatId Stored as a string.
 * @param {string} text
 */
function queueCommand(chatId, text) {
  const enqueuedAt = Date.now();

  // Evict stale commands
  const isFresh = (entry) => enqueuedAt - entry.timestamp < COMMAND_QUEUE_TTL_MS;
  commandQueue = commandQueue.filter(isFresh);

  // Enforce max queue size
  const queueIsFull = commandQueue.length >= COMMAND_QUEUE_MAX_SIZE;
  if (queueIsFull) {
    log(
      "warn",
      `command queue full (${COMMAND_QUEUE_MAX_SIZE}), dropping oldest`,
    );
    commandQueue.shift();
  }

  const entry = { chatId: String(chatId), text, timestamp: enqueuedAt };
  commandQueue.push(entry);
}
|
|
1258
|
+
|
|
1259
|
+
/**
 * Persist the command queue as pretty-printed JSON to
 * SENTINEL_COMMAND_QUEUE_FILE, creating the parent directory if needed.
 * Failures are logged as warnings and never thrown.
 * @returns {Promise<void>}
 */
async function writeCommandQueueFile() {
  try {
    const queueDir = dirname(SENTINEL_COMMAND_QUEUE_FILE);
    mkdirSync(queueDir, { recursive: true });

    const serialized = JSON.stringify(commandQueue, null, 2);
    await writeFile(SENTINEL_COMMAND_QUEUE_FILE, serialized, "utf8");
  } catch (err) {
    log("warn", `failed to write command queue: ${err.message}`);
  }
}
|
|
1275
|
+
|
|
1276
|
+
/**
 * Return a shallow copy of the command queue, so callers cannot add or
 * remove entries of the live queue.
 * @returns {Array<{ chatId: string, text: string, timestamp: number }>}
 */
export function getQueuedCommands() {
  const snapshot = Array.from(commandQueue);
  return snapshot;
}
|
|
1283
|
+
|
|
1284
|
+
// ── Monitor Lifecycle ────────────────────────────────────────────────────────
|
|
1285
|
+
|
|
1286
|
+
/**
 * Report whether readAlivePid() yields a non-null PID for the monitor PID file.
 * @returns {boolean}
 */
export function isMonitorRunning() {
  const monitorPid = readAlivePid(MONITOR_PID_FILE);
  return monitorPid !== null;
}
|
|
1293
|
+
|
|
1294
|
+
/**
 * Make sure openfleet is running, starting it when necessary.
 *
 * Resolves immediately when the monitor is already alive. Concurrent callers
 * share a single in-flight start through `monitorStartPromise`. A restart
 * attempt is recorded before starting, and a crash event is recorded when
 * the start fails.
 *
 * @param {string} reason - Human-readable reason for starting the monitor.
 * @returns {Promise<void>} Rejects with the start error when startup fails.
 */
export async function ensureMonitorRunning(reason) {
  // Already running
  if (readAlivePid(MONITOR_PID_FILE)) return;

  // Another call is already starting the monitor — piggyback on it
  if (monitorStartPromise) {
    log("info", `waiting for in-progress monitor start (reason: ${reason})`);
    return monitorStartPromise;
  }

  recordMonitorRestartAttempt();

  const startup = (async () => {
    try {
      await startAndWaitForMonitor(reason);
    } catch (err) {
      recordMonitorCrashEvent();
      throw err;
    }
  })();
  monitorStartPromise = startup;

  try {
    await startup;
  } finally {
    monitorStartPromise = null;
  }
}
|
|
1323
|
+
|
|
1324
|
+
/**
 * Spawn openfleet (cli.mjs --daemon-child) as a detached process and wait
 * until the monitor PID file points at a live process.
 *
 * If the sentinel is polling, polling is stopped and the sentinel poll lock
 * released first. Once the monitor is healthy the start time is recorded and
 * the sentinel switches to companion mode.
 *
 * @param {string} reason Logged as the reason for the start.
 * @returns {Promise<void>}
 * @throws {Error} When the spawn yields no PID, the spawned process exits
 *   before becoming healthy, or MONITOR_START_TIMEOUT_MS elapses.
 */
async function startAndWaitForMonitor(reason) {
  log("info", `starting openfleet (reason: ${reason})`);

  // If sentinel is currently polling, release the sentinel lock.
  // The monitor's telegram-bot.mjs will acquire its own poll lock.
  const wasPolling = polling;
  if (wasPolling) {
    polling = false;
    if (pollAbort) {
      try {
        pollAbort.abort();
      } catch {
        /* ok */
      }
    }
    await releaseSentinelPollLock();
    log("info", "released sentinel poll lock for monitor startup");
  }

  // Ensure log directory exists for daemon output
  const daemonLog = resolve(__dirname, "logs", "daemon.log");
  try {
    mkdirSync(dirname(daemonLog), { recursive: true });
  } catch {
    /* ok */
  }

  // Start cli.mjs as a detached daemon child
  const nodeArgs = [
    "--max-old-space-size=4096",
    resolve(__dirname, "cli.mjs"),
    "--daemon-child",
  ];
  const spawnOptions = {
    detached: true,
    stdio: "ignore",
    env: { ...process.env, CODEX_MONITOR_DAEMON: "1" },
    cwd: repoRoot,
  };
  const child = spawn(process.execPath, nodeArgs, spawnOptions);

  child.on("error", (err) => {
    log("error", `monitor spawn error: ${err.message}`);
  });
  // Do not keep the sentinel's event loop alive for the child.
  child.unref();

  const spawnedPid = child.pid;
  if (!spawnedPid) {
    throw new Error("openfleet failed to spawn (no PID)");
  }

  log("info", `monitor spawned (PID ${spawnedPid}), waiting for health...`);

  // Wait for the monitor to become healthy (PID file written + process alive)
  const deadline = Date.now() + MONITOR_START_TIMEOUT_MS;
  while (Date.now() < deadline) {
    await sleep(MONITOR_HEALTH_POLL_MS);

    const healthyPid = readAlivePid(MONITOR_PID_FILE);
    if (healthyPid) {
      log("info", `monitor is healthy (PID ${healthyPid})`);
      lastMonitorStartAt = Date.now();
      saveRecoveryState();
      // Transition to companion mode
      await transitionToCompanion(healthyPid);
      return;
    }

    // Check if spawned process died prematurely
    const childAlive = isProcessAlive(spawnedPid);
    if (!childAlive) {
      throw new Error(
        `openfleet process died during startup (PID ${spawnedPid})`,
      );
    }
  }

  throw new Error(
    `openfleet did not become healthy within ${MONITOR_START_TIMEOUT_MS / 1000}s`,
  );
}
|
|
1413
|
+
|
|
1414
|
+
// ── Mode Transitions ─────────────────────────────────────────────────────────
|
|
1415
|
+
|
|
1416
|
+
/**
 * Switch the sentinel to standalone mode and start polling Telegram itself.
 *
 * Nothing happens when already polling in standalone mode. Polling is not
 * started when the main bot's poll lock is held by a live process or when
 * the sentinel poll lock cannot be acquired. Updates returned by the first
 * poll are skipped (only `lastUpdateId` is advanced) before the loop starts.
 *
 * @param {string} reason Logged as the reason for the transition.
 */
async function transitionToStandalone(reason) {
  const alreadyStandalone = mode === "standalone" && polling;
  if (alreadyStandalone) {
    log("debug", `already in standalone mode (${reason})`);
    return;
  }

  log("info", `transitioning to standalone mode: ${reason}`);
  mode = "standalone";

  // Check if the main bot poll lock is held by a live process
  if (await isMainBotPolling()) {
    log("info", "main bot is still polling — skipping sentinel poll start");
    return;
  }

  // Acquire sentinel poll lock and start polling
  if (!(await acquireSentinelPollLock())) {
    log(
      "warn",
      "failed to acquire sentinel poll lock — another sentinel may be running",
    );
    return;
  }

  // Clear stale updates before starting the loop
  try {
    const staleUpdates = await pollUpdates();
    staleUpdates.forEach((stale) => {
      lastUpdateId = Math.max(lastUpdateId, stale.update_id);
    });
    if (staleUpdates.length > 0) {
      log("info", `skipped ${staleUpdates.length} stale updates`);
    }
  } catch {
    /* best effort */
  }

  polling = true;
  consecutivePollErrors = 0;

  // Fire polling loop (non-blocking)
  pollLoop().catch((err) => {
    log("error", `poll loop crashed: ${err.message}`);
    polling = false;
  });

  await writeHeartbeat();
}
|
|
1470
|
+
|
|
1471
|
+
/**
 * Switch the sentinel to companion mode: stop its own polling (aborting any
 * in-flight request), release the sentinel poll lock, and write a heartbeat.
 * @param {number} monitorPid PID of the running monitor (used for logging).
 */
async function transitionToCompanion(monitorPid) {
  log("info", `transitioning to companion mode (monitor PID ${monitorPid})`);
  mode = "companion";

  // Stop polling if active
  polling = false;
  try {
    pollAbort?.abort();
  } catch {
    /* ok */
  }

  await releaseSentinelPollLock();
  await writeHeartbeat();
}
|
|
1492
|
+
|
|
1493
|
+
/**
 * Tell whether the poll lock file at MONITOR_POLL_LOCK_FILE names a live
 * process.
 * @returns {Promise<boolean>} false when the lock file is missing, empty,
 *   unreadable, unparsable, or its recorded PID is not alive.
 */
async function isMainBotPolling() {
  try {
    if (!existsSync(MONITOR_POLL_LOCK_FILE)) return false;

    const contents = await readFile(MONITOR_POLL_LOCK_FILE, "utf8");
    const hasPayload = Boolean(contents && contents.trim());
    if (!hasPayload) return false;

    const lockOwner = JSON.parse(contents);
    return isProcessAlive(Number(lockOwner?.pid));
  } catch {
    return false;
  }
}
|
|
1509
|
+
|
|
1510
|
+
// ── Health Monitoring ────────────────────────────────────────────────────────
|
|
1511
|
+
|
|
1512
|
+
/**
 * Periodic health check for openfleet.
 *
 * - Companion mode, monitor gone: record the crash, notify the chat, switch
 *   to standalone mode and attempt recovery.
 * - Standalone mode, monitor alive: switch to companion mode.
 * - Standalone mode, monitor gone: reconcile who is polling (yield to the
 *   main bot, or resume when nobody polls), then attempt recovery when
 *   auto-restart is enabled and no start is in flight.
 *
 * Finally, if the sentinel PID file names a different live process this
 * instance stops; otherwise a heartbeat is written.
 */
async function healthCheck() {
  const monPid = readAlivePid(MONITOR_PID_FILE);
  const monitorAlive = Boolean(monPid);

  if (mode === "companion" && !monitorAlive) {
    // Monitor died while in companion mode — send crash notification and go standalone
    log("warn", "monitor process died — transitioning to standalone");
    removePidFile(MONITOR_PID_FILE);
    recordMonitorCrashEvent();

    const sinceStartMs =
      lastMonitorStartAt > 0 ? Date.now() - lastMonitorStartAt : null;
    const crashedQuickly =
      Number.isFinite(sinceStartMs) &&
      sinceStartMs <= sentinelConfig.monitorStartGraceMs;

    // Notify user
    const host = os.hostname();
    const tag = projectName ? `[${projectName}]` : "";
    const notice = [
      `🔥 ${tag} openfleet crashed`,
      "",
      `Host: \`${host}\``,
      `Time: ${new Date().toISOString()}`,
      crashedQuickly
        ? `Detected rapid crash (${formatUptime(sinceStartMs)} after startup).`
        : "",
      "",
      "Sentinel is switching to standalone mode and will attempt automatic recovery.",
    ].join("\n");
    await sendTelegram(telegramChatId, notice, { parseMode: "Markdown" });

    await transitionToStandalone("monitor process died");
    await attemptMonitorRecovery("monitor crashed in companion mode");
  } else if (mode === "standalone" && monitorAlive) {
    // Monitor appeared while in standalone mode (started externally)
    log(
      "info",
      `monitor detected (PID ${monPid}) — switching to companion mode`,
    );
    await transitionToCompanion(monPid);
  } else if (mode === "standalone") {
    // Check if main bot has acquired the poll lock (edge case: monitor starting up)
    const mainPolling = await isMainBotPolling();
    if (mainPolling && polling) {
      log("info", "main bot is polling — stopping sentinel polling");
      polling = false;
      try {
        pollAbort?.abort();
      } catch {
        /* ok */
      }
      await releaseSentinelPollLock();
    } else if (!mainPolling && !polling) {
      // Neither is polling — sentinel should resume
      log("info", "no poller active — resuming sentinel polling");
      await transitionToStandalone("no active poller detected");
    }

    if (sentinelConfig.autoRestartMonitor && !monitorStartPromise) {
      await attemptMonitorRecovery("monitor not running during standalone health check");
    }
  }

  // Clean up stale PID files
  const otherSentinelPid = readAlivePid(SENTINEL_PID_FILE);
  if (otherSentinelPid && otherSentinelPid !== process.pid) {
    // Another sentinel is alive — we shouldn't be running
    log(
      "warn",
      `another sentinel is alive (PID ${otherSentinelPid}) — stopping this instance`,
    );
    stopSentinel();
    return;
  }

  await writeHeartbeat();
}
|
|
1601
|
+
|
|
1602
|
+
// ── Heartbeat ────────────────────────────────────────────────────────────────
|
|
1603
|
+
|
|
1604
|
+
/**
 * Persist a snapshot of the sentinel's current state to the heartbeat file.
 *
 * The parent directory is created on demand. Any failure while creating the
 * directory or writing the file is logged as a warning and not rethrown.
 *
 * @returns {Promise<void>}
 */
async function writeHeartbeat() {
  /** @type {import("./telegram-sentinel.mjs").SentinelHeartbeat} */
  const snapshot = {
    pid: process.pid,
    startedAt,
    mode,
    monitorPid: readAlivePid(MONITOR_PID_FILE),
    lastCheck: new Date().toISOString(),
    commandsQueued: commandQueue.length,
    commandsProcessed,
  };

  try {
    const heartbeatDir = dirname(SENTINEL_HEARTBEAT_FILE);
    mkdirSync(heartbeatDir, { recursive: true });

    const serialized = JSON.stringify(snapshot, null, 2);
    await writeFile(SENTINEL_HEARTBEAT_FILE, serialized, "utf8");
  } catch (err) {
    log("warn", `heartbeat write failed: ${err.message}`);
  }
}
|
|
1631
|
+
|
|
1632
|
+
// ── Public API ───────────────────────────────────────────────────────────────
|
|
1633
|
+
|
|
1634
|
+
/**
 * Start the Telegram sentinel. This is the main entry point.
 *
 * Steps, in order: load the environment, verify the Telegram credentials,
 * create the cache and PID directories, refuse to start when another live
 * sentinel owns the PID file (unless `skipExistingCheck` is set), pick the
 * initial mode from the monitor's liveness, schedule the periodic health
 * check and heartbeat, and register process-level shutdown/crash handlers.
 *
 * Exits the process with code 1 when the Telegram credentials are missing or
 * another sentinel is already running. Returns without doing anything when
 * this sentinel is already running.
 *
 * @param {object} [options]
 * @param {boolean} [options.skipExistingCheck] - Skip checking for an existing sentinel.
 * @returns {Promise<void>}
 */
export async function startSentinel(options = {}) {
  if (running) {
    log("warn", "sentinel is already running");
    return;
  }

  initEnv();

  if (!telegramToken || !telegramChatId) {
    log(
      "error",
      "cannot start sentinel: TELEGRAM_BOT_TOKEN or TELEGRAM_CHAT_ID not configured",
    );
    console.error(
      `${TAG} Set these in .env (project root) or as environment variables.`,
    );
    process.exit(1);
  }

  // Ensure cache directory exists
  mkdirSync(cacheDir, { recursive: true });
  mkdirSync(dirname(MONITOR_PID_FILE), { recursive: true });

  // Check for existing sentinel
  if (!options.skipExistingCheck) {
    const existingPid = readAlivePid(SENTINEL_PID_FILE);
    if (existingPid && existingPid !== process.pid) {
      console.error(
        `${TAG} Another sentinel is already running (PID ${existingPid}). Use --stop first.`,
      );
      process.exit(1);
    }
  }

  running = true;
  startedAt = new Date().toISOString();
  loadRecoveryState();
  writePidFile(SENTINEL_PID_FILE, process.pid);

  log("info", `sentinel started (PID ${process.pid})`);

  // Determine initial mode
  const monPid = readAlivePid(MONITOR_PID_FILE);
  if (monPid) {
    log(
      "info",
      `openfleet already running (PID ${monPid}) — starting in companion mode`,
    );
    await transitionToCompanion(monPid);
  } else {
    log("info", "openfleet not running — starting in standalone mode");
    await transitionToStandalone("initial startup");
  }

  // Set up periodic health checks
  healthCheckTimer = setInterval(() => {
    healthCheck().catch((err) => {
      log("error", `health check error: ${err.message}`);
    });
  }, HEALTH_CHECK_INTERVAL_MS);
  // unref() so the timer alone does not keep the process alive
  if (healthCheckTimer.unref) healthCheckTimer.unref();

  // Set up periodic heartbeat writes
  heartbeatTimer = setInterval(() => {
    writeHeartbeat().catch(() => {});
  }, HEALTH_CHECK_INTERVAL_MS);
  if (heartbeatTimer.unref) heartbeatTimer.unref();

  // Initial heartbeat
  await writeHeartbeat();

  // Register shutdown handlers
  const shutdown = () => {
    log("info", "received shutdown signal");
    stopSentinel();
    process.exit(0);
  };
  process.on("SIGINT", shutdown);
  process.on("SIGTERM", shutdown);
  process.on("uncaughtException", (err) => {
    log("error", `uncaught exception: ${err.message}\n${err.stack}`);

    // Attempt crash notification. process.exit() abandons pending async
    // work, so exiting immediately would drop the in-flight request; wait
    // for it to settle, bounded by a timeout so a hung request cannot keep
    // a crashed process alive.
    const CRASH_NOTIFY_TIMEOUT_MS = 5000;
    let notified;
    try {
      notified = Promise.resolve(
        sendTelegram(
          telegramChatId,
          `🛡️❌ Sentinel crashed: ${err.message}\nHost: \`${os.hostname()}\``,
          { parseMode: "Markdown" },
        ),
      ).catch(() => {});
    } catch {
      // sendTelegram threw synchronously — nothing to wait for
      notified = Promise.resolve();
    }

    stopSentinel();

    let deadlineTimer;
    const deadline = new Promise((done) => {
      deadlineTimer = setTimeout(done, CRASH_NOTIFY_TIMEOUT_MS);
    });
    Promise.race([notified, deadline]).finally(() => {
      clearTimeout(deadlineTimer);
      process.exit(1);
    });
  });
  process.on("unhandledRejection", (reason) => {
    log("error", `unhandled rejection: ${reason}`);
  });
}
|
|
1735
|
+
|
|
1736
|
+
/**
 * Shut the sentinel down: abort any in-flight poll, cancel the health-check
 * and heartbeat timers, release the poll lock, and remove the PID and
 * heartbeat files. Does nothing when the sentinel is not running.
 */
export function stopSentinel() {
  if (!running) {
    return;
  }

  running = false;
  polling = false;

  // Cancel an outstanding poll request, ignoring any error from abort()
  try {
    if (pollAbort) pollAbort.abort();
  } catch {
    /* ok */
  }

  // Cancel the periodic jobs
  if (healthCheckTimer) {
    clearInterval(healthCheckTimer);
    healthCheckTimer = null;
  }
  if (heartbeatTimer) {
    clearInterval(heartbeatTimer);
    heartbeatTimer = null;
  }

  // Give up the poll lock (fire-and-forget) and the PID file
  releaseSentinelPollLock().catch(() => {});
  removePidFile(SENTINEL_PID_FILE);

  // Remove the heartbeat file if it is still there
  try {
    const heartbeatPresent = existsSync(SENTINEL_HEARTBEAT_FILE);
    if (heartbeatPresent) {
      unlinkSync(SENTINEL_HEARTBEAT_FILE);
    }
  } catch {
    /* best effort */
  }

  log("info", "sentinel stopped");
}
|
|
1777
|
+
|
|
1778
|
+
/**
 * Build a snapshot of the sentinel's in-memory state.
 *
 * `uptime` is the number of milliseconds between `startedAt` and now.
 *
 * @returns {SentinelStatus}
 */
export function getSentinelStatus() {
  const monitorPid = readAlivePid(MONITOR_PID_FILE);
  const queuedCount = commandQueue.length;
  const nowMs = Date.now();
  const startedMs = new Date(startedAt).getTime();

  return {
    pid: process.pid,
    running,
    startedAt,
    mode,
    monitorPid,
    polling,
    commandsQueued: queuedCount,
    commandsProcessed,
    consecutivePollErrors,
    uptime: nowMs - startedMs,
  };
}
|
|
1796
|
+
|
|
1797
|
+
/**
 * Report the monitor-recovery bookkeeping: how many crash events and restart
 * attempts `pruneTimestamps` keeps for the current time, whether
 * `isCrashLoopDetected` reports a crash loop, the configured crash-loop
 * threshold and window, and the last-repair / recovery-in-progress state.
 *
 * @returns {{
 *   crashLoopDetected: boolean,
 *   crashesInWindow: number,
 *   restartsInWindow: number,
 *   crashLoopThreshold: number,
 *   crashLoopWindowMs: number,
 *   lastRepairAt: number,
 *   recoveryInProgress: boolean,
 * }}
 */
export function getSentinelRecoveryStatus() {
  const checkedAt = Date.now();
  const { length: crashesInWindow } = pruneTimestamps(
    monitorCrashEvents,
    checkedAt,
  );
  const { length: restartsInWindow } = pruneTimestamps(
    monitorRestartAttempts,
    checkedAt,
  );
  const crashLoopDetected = isCrashLoopDetected(checkedAt);

  return {
    crashLoopDetected,
    crashesInWindow,
    restartsInWindow,
    crashLoopThreshold: sentinelConfig.crashLoopThreshold,
    crashLoopWindowMs: sentinelConfig.crashLoopWindowMs,
    lastRepairAt,
    recoveryInProgress,
  };
}
|
|
1811
|
+
|
|
1812
|
+
/**
 * Test hook: overwrite the in-memory recovery bookkeeping.
 *
 * Array fields are shallow-copied; any non-array value resets the field to
 * an empty array. Numeric fields are coerced with `Number()` and fall back
 * to 0 when the result is falsy (missing, NaN, or 0).
 *
 * @param {object} [state]
 * @param {number[]} [state.monitorRestartAttempts]
 * @param {number[]} [state.monitorCrashEvents]
 * @param {number} [state.lastRepairAt]
 * @param {number} [state.lastMonitorStartAt]
 * @param {number} [state.monitorManualStopUntil]
 */
export function __setRecoveryStateForTest(state = {}) {
  const copyList = (candidate) =>
    Array.isArray(candidate) ? Array.from(candidate) : [];
  const toNumberOrZero = (candidate) => Number(candidate) || 0;

  monitorRestartAttempts = copyList(state.monitorRestartAttempts);
  monitorCrashEvents = copyList(state.monitorCrashEvents);
  lastRepairAt = toNumberOrZero(state.lastRepairAt);
  lastMonitorStartAt = toNumberOrZero(state.lastMonitorStartAt);
  monitorManualStopUntil = toNumberOrZero(state.monitorManualStopUntil);
}
|
|
1823
|
+
|
|
1824
|
+
// ── Logging ──────────────────────────────────────────────────────────────────
|
|
1825
|
+
|
|
1826
|
+
/**
 * Minimal tagged logger. Every line is prefixed with an ISO timestamp and TAG.
 *
 * "error" goes to console.error and "warn" to console.warn, each with a
 * level label. "debug" lines are printed via console.log only when the
 * SENTINEL_DEBUG environment variable is "1". Any other level is printed via
 * console.log without a level label.
 *
 * @param {"info" | "warn" | "error" | "debug"} level
 * @param {string} message
 */
function log(level, message) {
  const prefix = `${new Date().toISOString()} ${TAG}`;

  if (level === "error") {
    console.error(`${prefix} ERROR: ${message}`);
    return;
  }

  if (level === "warn") {
    console.warn(`${prefix} WARN: ${message}`);
    return;
  }

  if (level === "debug") {
    const debugEnabled = process.env.SENTINEL_DEBUG === "1";
    if (debugEnabled) {
      console.log(`${prefix} DEBUG: ${message}`);
    }
    return;
  }

  console.log(`${prefix} ${message}`);
}
|
|
1850
|
+
|
|
1851
|
+
// ── Utility ──────────────────────────────────────────────────────────────────
|
|
1852
|
+
|
|
1853
|
+
/**
 * Format a duration in milliseconds to a human-readable string.
 *
 * Only the two or three most significant units are shown:
 * "1d 1h 1m", "1h 1m", "1m 1s" or "1s". Negative, NaN, infinite and
 * non-numeric durations are treated as 0 and yield "0s".
 *
 * @param {number} ms
 * @returns {string}
 */
function formatUptime(ms) {
  // Coerce once and clamp: without this, NaN/undefined printed "NaNs" and
  // Infinity printed "Infinityd NaNh NaNm".
  const numericMs = Number(ms);
  const safeMs = Number.isFinite(numericMs) && numericMs > 0 ? numericMs : 0;

  const seconds = Math.floor(safeMs / 1000);
  const minutes = Math.floor(seconds / 60);
  const hours = Math.floor(minutes / 60);
  const days = Math.floor(hours / 24);

  if (days > 0) return `${days}d ${hours % 24}h ${minutes % 60}m`;
  if (hours > 0) return `${hours}h ${minutes % 60}m`;
  if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
  return `${seconds}s`;
}
|
|
1870
|
+
|
|
1871
|
+
/**
 * Return a promise that resolves (with no value) after `ms` milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
1879
|
+
|
|
1880
|
+
// ── Type Definitions (JSDoc) ─────────────────────────────────────────────────
|
|
1881
|
+
|
|
1882
|
+
/**
|
|
1883
|
+
* @typedef {object} SentinelHeartbeat
|
|
1884
|
+
* @property {number} pid
|
|
1885
|
+
* @property {string} startedAt
|
|
1886
|
+
* @property {"standalone" | "companion"} mode
|
|
1887
|
+
* @property {number | null} monitorPid
|
|
1888
|
+
* @property {string} lastCheck
|
|
1889
|
+
* @property {number} commandsQueued
|
|
1890
|
+
* @property {number} commandsProcessed
|
|
1891
|
+
*/
|
|
1892
|
+
|
|
1893
|
+
/**
|
|
1894
|
+
* @typedef {object} SentinelStatus
|
|
1895
|
+
* @property {number} pid
|
|
1896
|
+
* @property {boolean} running
|
|
1897
|
+
* @property {string} startedAt
|
|
1898
|
+
* @property {"standalone" | "companion"} mode
|
|
1899
|
+
* @property {number | null} monitorPid
|
|
1900
|
+
* @property {boolean} polling
|
|
1901
|
+
* @property {number} commandsQueued
|
|
1902
|
+
* @property {number} commandsProcessed
|
|
1903
|
+
* @property {number} consecutivePollErrors
|
|
1904
|
+
* @property {number} uptime
|
|
1905
|
+
*/
|
|
1906
|
+
|
|
1907
|
+
// ── CLI Entry Point ──────────────────────────────────────────────────────────
|
|
1908
|
+
|
|
1909
|
+
// True when this module is the script passed to `node` (process.argv[1]),
// false when it was imported or when the comparison cannot be made.
const isDirectExecution = (() => {
  // Forward slashes + lower case so Windows and POSIX spellings compare equal
  const canonicalize = (filePath) => filePath.replace(/\\/g, "/").toLowerCase();

  try {
    const modulePath = fileURLToPath(import.meta.url);
    const entryScript = process.argv[1];
    if (!entryScript) {
      return false;
    }
    return canonicalize(modulePath) === canonicalize(resolve(entryScript));
  } catch {
    return false;
  }
})();
|
|
1922
|
+
|
|
1923
|
+
// Command-line entry point: runs only when this file is executed directly.
// Handles --help / -h, --stop and --status; with no recognised flag it
// starts the sentinel.
if (isDirectExecution) {
  const cliArgs = process.argv.slice(2);
  const hasFlag = (flag) => cliArgs.includes(flag);

  if (hasFlag("--help") || hasFlag("-h")) {
    console.log(`
telegram-sentinel — Always-on Telegram command listener for openfleet

USAGE
node telegram-sentinel.mjs [options]

OPTIONS
--stop Stop a running sentinel
--status Check sentinel status
--help Show this help

ENVIRONMENT
TELEGRAM_BOT_TOKEN Telegram bot token (or set in .env)
TELEGRAM_CHAT_ID Authorized chat ID (or set in .env)
SENTINEL_DEBUG=1 Enable debug logging

The sentinel monitors openfleet and handles Telegram commands
even when the main process is not running.
`);
    process.exit(0);
  }

  if (hasFlag("--stop")) {
    const targetPid = readAlivePid(SENTINEL_PID_FILE);
    if (!targetPid) {
      console.log(" No sentinel running.");
      removePidFile(SENTINEL_PID_FILE);
      process.exit(0);
    }

    console.log(` Stopping sentinel (PID ${targetPid})...`);
    try {
      // Ask politely first, then poll up to 20 times at 500 ms intervals
      process.kill(targetPid, "SIGTERM");
      let checksLeft = 20;
      let exited = false;
      while (checksLeft > 0 && !exited) {
        checksLeft -= 1;
        await sleep(500);
        exited = !isProcessAlive(targetPid);
      }

      // Still alive after the grace period: force-kill, ignoring failures
      if (!exited) {
        try {
          process.kill(targetPid, "SIGKILL");
        } catch {
          /* ok */
        }
      }

      removePidFile(SENTINEL_PID_FILE);
      console.log(" ✓ Sentinel stopped.");
    } catch (err) {
      console.error(` Failed: ${err.message}`);
      process.exit(1);
    }
    process.exit(0);
  }

  if (hasFlag("--status")) {
    const livePid = readAlivePid(SENTINEL_PID_FILE);
    if (!livePid) {
      console.log(" Sentinel is not running.");
      removePidFile(SENTINEL_PID_FILE);
    } else {
      console.log(` Sentinel is running (PID ${livePid})`);
      // Heartbeat details are optional: a missing or unreadable file is ignored
      try {
        if (existsSync(SENTINEL_HEARTBEAT_FILE)) {
          const heartbeatText = readFileSync(SENTINEL_HEARTBEAT_FILE, "utf8");
          const hb = JSON.parse(heartbeatText);
          console.log(` Mode: ${hb.mode}`);
          console.log(` Monitor PID: ${hb.monitorPid || "none"}`);
          console.log(` Last check: ${hb.lastCheck}`);
          console.log(` Commands processed: ${hb.commandsProcessed}`);
        }
      } catch {
        /* best effort */
      }
    }
    process.exit(0);
  }

  // Default: start sentinel
  startSentinel().catch((err) => {
    console.error(`${TAG} Fatal: ${err.message}`);
    process.exit(1);
  });
}
|