@suzuke/agend 0.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. package/README.md +78 -0
  2. package/README.zh-TW.md +79 -0
  3. package/dist/access-path.d.ts +7 -0
  4. package/dist/access-path.js +12 -0
  5. package/dist/access-path.js.map +1 -0
  6. package/dist/backend/claude-code.d.ts +13 -0
  7. package/dist/backend/claude-code.js +114 -0
  8. package/dist/backend/claude-code.js.map +1 -0
  9. package/dist/backend/codex.d.ts +10 -0
  10. package/dist/backend/codex.js +58 -0
  11. package/dist/backend/codex.js.map +1 -0
  12. package/dist/backend/factory.d.ts +2 -0
  13. package/dist/backend/factory.js +19 -0
  14. package/dist/backend/factory.js.map +1 -0
  15. package/dist/backend/gemini-cli.d.ts +10 -0
  16. package/dist/backend/gemini-cli.js +68 -0
  17. package/dist/backend/gemini-cli.js.map +1 -0
  18. package/dist/backend/index.d.ts +6 -0
  19. package/dist/backend/index.js +6 -0
  20. package/dist/backend/index.js.map +1 -0
  21. package/dist/backend/opencode.d.ts +10 -0
  22. package/dist/backend/opencode.js +63 -0
  23. package/dist/backend/opencode.js.map +1 -0
  24. package/dist/backend/types.d.ts +26 -0
  25. package/dist/backend/types.js +2 -0
  26. package/dist/backend/types.js.map +1 -0
  27. package/dist/channel/access-manager.d.ts +18 -0
  28. package/dist/channel/access-manager.js +149 -0
  29. package/dist/channel/access-manager.js.map +1 -0
  30. package/dist/channel/adapters/discord.d.ts +45 -0
  31. package/dist/channel/adapters/discord.js +366 -0
  32. package/dist/channel/adapters/discord.js.map +1 -0
  33. package/dist/channel/adapters/telegram.d.ts +58 -0
  34. package/dist/channel/adapters/telegram.js +569 -0
  35. package/dist/channel/adapters/telegram.js.map +1 -0
  36. package/dist/channel/attachment-handler.d.ts +15 -0
  37. package/dist/channel/attachment-handler.js +55 -0
  38. package/dist/channel/attachment-handler.js.map +1 -0
  39. package/dist/channel/factory.d.ts +12 -0
  40. package/dist/channel/factory.js +38 -0
  41. package/dist/channel/factory.js.map +1 -0
  42. package/dist/channel/ipc-bridge.d.ts +26 -0
  43. package/dist/channel/ipc-bridge.js +170 -0
  44. package/dist/channel/ipc-bridge.js.map +1 -0
  45. package/dist/channel/mcp-server.d.ts +10 -0
  46. package/dist/channel/mcp-server.js +196 -0
  47. package/dist/channel/mcp-server.js.map +1 -0
  48. package/dist/channel/mcp-tools.d.ts +909 -0
  49. package/dist/channel/mcp-tools.js +346 -0
  50. package/dist/channel/mcp-tools.js.map +1 -0
  51. package/dist/channel/message-bus.d.ts +17 -0
  52. package/dist/channel/message-bus.js +86 -0
  53. package/dist/channel/message-bus.js.map +1 -0
  54. package/dist/channel/message-queue.d.ts +39 -0
  55. package/dist/channel/message-queue.js +248 -0
  56. package/dist/channel/message-queue.js.map +1 -0
  57. package/dist/channel/tool-router.d.ts +6 -0
  58. package/dist/channel/tool-router.js +69 -0
  59. package/dist/channel/tool-router.js.map +1 -0
  60. package/dist/channel/tool-tracker.d.ts +13 -0
  61. package/dist/channel/tool-tracker.js +58 -0
  62. package/dist/channel/tool-tracker.js.map +1 -0
  63. package/dist/channel/types.d.ts +116 -0
  64. package/dist/channel/types.js +2 -0
  65. package/dist/channel/types.js.map +1 -0
  66. package/dist/cli.d.ts +2 -0
  67. package/dist/cli.js +782 -0
  68. package/dist/cli.js.map +1 -0
  69. package/dist/config.d.ts +8 -0
  70. package/dist/config.js +85 -0
  71. package/dist/config.js.map +1 -0
  72. package/dist/context-guardian.d.ts +29 -0
  73. package/dist/context-guardian.js +123 -0
  74. package/dist/context-guardian.js.map +1 -0
  75. package/dist/cost-guard.d.ts +21 -0
  76. package/dist/cost-guard.js +113 -0
  77. package/dist/cost-guard.js.map +1 -0
  78. package/dist/daemon-entry.d.ts +1 -0
  79. package/dist/daemon-entry.js +29 -0
  80. package/dist/daemon-entry.js.map +1 -0
  81. package/dist/daemon.d.ts +88 -0
  82. package/dist/daemon.js +821 -0
  83. package/dist/daemon.js.map +1 -0
  84. package/dist/daily-summary.d.ts +13 -0
  85. package/dist/daily-summary.js +55 -0
  86. package/dist/daily-summary.js.map +1 -0
  87. package/dist/event-log.d.ts +22 -0
  88. package/dist/event-log.js +66 -0
  89. package/dist/event-log.js.map +1 -0
  90. package/dist/export-import.d.ts +2 -0
  91. package/dist/export-import.js +110 -0
  92. package/dist/export-import.js.map +1 -0
  93. package/dist/fleet-context.d.ts +36 -0
  94. package/dist/fleet-context.js +4 -0
  95. package/dist/fleet-context.js.map +1 -0
  96. package/dist/fleet-manager.d.ts +115 -0
  97. package/dist/fleet-manager.js +1739 -0
  98. package/dist/fleet-manager.js.map +1 -0
  99. package/dist/fleet-system-prompt.d.ts +11 -0
  100. package/dist/fleet-system-prompt.js +60 -0
  101. package/dist/fleet-system-prompt.js.map +1 -0
  102. package/dist/hang-detector.d.ts +16 -0
  103. package/dist/hang-detector.js +53 -0
  104. package/dist/hang-detector.js.map +1 -0
  105. package/dist/index.d.ts +8 -0
  106. package/dist/index.js +6 -0
  107. package/dist/index.js.map +1 -0
  108. package/dist/logger.d.ts +3 -0
  109. package/dist/logger.js +63 -0
  110. package/dist/logger.js.map +1 -0
  111. package/dist/plugin/agend/.claude-plugin/plugin.json +5 -0
  112. package/dist/scheduler/db.d.ts +16 -0
  113. package/dist/scheduler/db.js +132 -0
  114. package/dist/scheduler/db.js.map +1 -0
  115. package/dist/scheduler/db.test.d.ts +1 -0
  116. package/dist/scheduler/db.test.js +92 -0
  117. package/dist/scheduler/db.test.js.map +1 -0
  118. package/dist/scheduler/index.d.ts +4 -0
  119. package/dist/scheduler/index.js +4 -0
  120. package/dist/scheduler/index.js.map +1 -0
  121. package/dist/scheduler/scheduler.d.ts +25 -0
  122. package/dist/scheduler/scheduler.js +119 -0
  123. package/dist/scheduler/scheduler.js.map +1 -0
  124. package/dist/scheduler/scheduler.test.d.ts +1 -0
  125. package/dist/scheduler/scheduler.test.js +119 -0
  126. package/dist/scheduler/scheduler.test.js.map +1 -0
  127. package/dist/scheduler/types.d.ts +47 -0
  128. package/dist/scheduler/types.js +7 -0
  129. package/dist/scheduler/types.js.map +1 -0
  130. package/dist/service-installer.d.ts +14 -0
  131. package/dist/service-installer.js +91 -0
  132. package/dist/service-installer.js.map +1 -0
  133. package/dist/setup-wizard.d.ts +14 -0
  134. package/dist/setup-wizard.js +517 -0
  135. package/dist/setup-wizard.js.map +1 -0
  136. package/dist/stt.d.ts +10 -0
  137. package/dist/stt.js +33 -0
  138. package/dist/stt.js.map +1 -0
  139. package/dist/tmux-manager.d.ts +22 -0
  140. package/dist/tmux-manager.js +132 -0
  141. package/dist/tmux-manager.js.map +1 -0
  142. package/dist/topic-commands.d.ts +22 -0
  143. package/dist/topic-commands.js +176 -0
  144. package/dist/topic-commands.js.map +1 -0
  145. package/dist/transcript-monitor.d.ts +21 -0
  146. package/dist/transcript-monitor.js +149 -0
  147. package/dist/transcript-monitor.js.map +1 -0
  148. package/dist/types.d.ts +153 -0
  149. package/dist/types.js +2 -0
  150. package/dist/types.js.map +1 -0
  151. package/dist/webhook-emitter.d.ts +15 -0
  152. package/dist/webhook-emitter.js +41 -0
  153. package/dist/webhook-emitter.js.map +1 -0
  154. package/package.json +58 -4
  155. package/templates/launchd.plist.ejs +29 -0
  156. package/templates/systemd.service.ejs +15 -0
  157. package/index.js +0 -1
@@ -0,0 +1,1739 @@
1
+ import { existsSync, readFileSync, mkdirSync, writeFileSync, unlinkSync } from "node:fs";
2
+ import { access } from "node:fs/promises";
3
+ import { createServer } from "node:http";
4
+ import { join, dirname, basename } from "node:path";
5
+ import { homedir } from "node:os";
6
+ import { fileURLToPath } from "node:url";
7
+ import yaml from "js-yaml";
8
+ const __filename = fileURLToPath(import.meta.url);
9
+ const __dirname = dirname(__filename);
10
+ import { isProbeableRouteTarget } from "./fleet-context.js";
11
+ import { loadFleetConfig, DEFAULT_COST_GUARD, DEFAULT_DAILY_SUMMARY, DEFAULT_INSTANCE_CONFIG } from "./config.js";
12
+ import { EventLog } from "./event-log.js";
13
+ import { CostGuard, formatCents } from "./cost-guard.js";
14
+ import { TmuxManager } from "./tmux-manager.js";
15
+ import { AccessManager } from "./channel/access-manager.js";
16
+ import { IpcClient } from "./channel/ipc-bridge.js";
17
+ import { createAdapter } from "./channel/factory.js";
18
+ import { createLogger } from "./logger.js";
19
+ import { processAttachments } from "./channel/attachment-handler.js";
20
+ import { routeToolCall } from "./channel/tool-router.js";
21
+ import { Scheduler } from "./scheduler/index.js";
22
+ import { DEFAULT_SCHEDULER_CONFIG } from "./scheduler/index.js";
23
+ import { TopicCommands, sanitizeInstanceName } from "./topic-commands.js";
24
+ import { DailySummary } from "./daily-summary.js";
25
+ import { WebhookEmitter } from "./webhook-emitter.js";
26
+ const TMUX_SESSION = "agend";
27
/**
 * Decide which message thread a reply should be posted to.
 *
 * Precedence: an explicit non-empty string `thread_id` from the tool args
 * wins; general-topic instances reply in the main chat (no thread);
 * otherwise fall back to the instance's bound `topic_id`, stringified.
 *
 * @param {unknown} argsThreadId - thread_id from the tool call, if any.
 * @param {{ general_topic?: boolean, topic_id?: number | string } | undefined} instanceConfig
 * @returns {string | undefined} thread id to reply into, or undefined for the main chat.
 */
export function resolveReplyThreadId(argsThreadId, instanceConfig) {
  const hasExplicitThread = typeof argsThreadId === "string" && argsThreadId.length > 0;
  if (hasExplicitThread) {
    return argsThreadId;
  }
  // General-topic instances always answer in the chat root.
  if (instanceConfig?.general_topic) {
    return undefined;
  }
  const topicId = instanceConfig?.topic_id;
  return topicId == null ? undefined : String(topicId);
}
36
+ export class FleetManager {
37
  // Root directory for all fleet state (pid files, sockets, event db, inbox).
  dataDir;
  children = new Map();
  // name → Daemon for instances started in-process (populated by startInstance).
  daemons = new Map();
  // Parsed fleet.yaml; set by loadConfig().
  fleetConfig = null;
  // Shared channel adapter in topic mode; null until startSharedAdapter() runs.
  adapter = null;
  // topic_id → { kind: "general" | "instance", name }; built by buildRoutingTable().
  routingTable = new Map();
  // instance name → IpcClient connected to that daemon's channel.sock.
  instanceIpcClients = new Map();
  scheduler = null;
  configPath = "";
  logger = createLogger("info");
  topicCommands;
  // sessionName → instanceName mapping for external sessions
  sessionRegistry = new Map();
  eventLog = null;
  costGuard = null;
  statuslineWatchers = new Map();
  // instance name → rate-limit snapshot; seven_day_pct is read by replyIfRateLimited().
  instanceRateLimits = new Map();
  dailySummary = null;
  webhookEmitter = null;
  // Topic icon + auto-archive state
  topicIcons = {};
  lastActivity = new Map();
  archivedTopics = new Set();
  archiveTimer = null;
  static ARCHIVE_IDLE_MS = 24 * 60 * 60 * 1000; // 24 hours
  // Model failover state
  failoverActive = new Map(); // instance → current failover model
  // Health endpoint
  healthServer = null;
  startedAt = 0;
  /** @param {string} dataDir - root directory for fleet state. */
  constructor(dataDir) {
    this.dataDir = dataDir;
    this.topicCommands = new TopicCommands(this);
  }
71
+ /** Load fleet.yaml and build routing table */
72
+ loadConfig(configPath) {
73
+ this.fleetConfig = loadFleetConfig(configPath);
74
+ return this.fleetConfig;
75
+ }
76
+ /** Build topic routing table: { topicId -> RouteTarget } */
77
+ buildRoutingTable() {
78
+ const table = new Map();
79
+ if (!this.fleetConfig)
80
+ return table;
81
+ for (const [name, inst] of Object.entries(this.fleetConfig.instances)) {
82
+ if (inst.topic_id != null) {
83
+ table.set(inst.topic_id, {
84
+ kind: inst.general_topic ? "general" : "instance",
85
+ name,
86
+ });
87
+ }
88
+ }
89
+ return table;
90
+ }
91
  /** Per-instance state directory: <dataDir>/instances/<name>. */
  getInstanceDir(name) {
    return join(this.dataDir, "instances", name);
  }
94
+ getInstanceStatus(name) {
95
+ const pidPath = join(this.getInstanceDir(name), "daemon.pid");
96
+ if (!existsSync(pidPath))
97
+ return "stopped";
98
+ const pid = parseInt(readFileSync(pidPath, "utf-8").trim(), 10);
99
+ try {
100
+ process.kill(pid, 0);
101
+ return "running";
102
+ }
103
+ catch {
104
+ return "crashed";
105
+ }
106
+ }
107
  /**
   * Start a single managed instance daemon.
   * No-ops when the instance is already tracked in this.daemons, or when its
   * configured working_directory does not exist on disk. Wires daemon events
   * (restart_complete, hang, crash_loop) into the event log, notifications,
   * topic icons, and the webhook emitter.
   *
   * @param {string} name - instance name (key in fleet.instances).
   * @param {object} config - instance config (working_directory, backend, …).
   * @param {boolean} topicMode - whether the fleet runs in channel topic mode.
   */
  async startInstance(name, config, topicMode) {
    if (this.daemons.has(name)) {
      this.logger.info({ name }, "Instance already running, skipping");
      return;
    }
    if (!existsSync(config.working_directory)) {
      this.logger.error({ name, working_directory: config.working_directory }, "Working directory does not exist — skipping instance");
      return;
    }
    const instanceDir = this.getInstanceDir(name);
    mkdirSync(instanceDir, { recursive: true });
    // Dynamic imports — presumably to break a circular dependency with
    // daemon.js / backend/factory.js; TODO confirm.
    const { Daemon } = await import("./daemon.js");
    const { createBackend } = await import("./backend/factory.js");
    // Backend resolution: per-instance override → fleet default → "claude-code".
    const backendName = config.backend ?? this.fleetConfig?.defaults?.backend ?? "claude-code";
    const backend = createBackend(backendName, instanceDir);
    const daemon = new Daemon(name, config, instanceDir, topicMode, backend);
    await daemon.start();
    this.daemons.set(name, daemon);
    daemon.on("restart_complete", (data) => {
      this.eventLog?.insert(name, "context_rotation", data);
      this.logger.info({ name, ...data }, "Context restart completed");
    });
    const hangDetector = daemon.getHangDetector();
    if (hangDetector) {
      hangDetector.on("hang", () => {
        this.eventLog?.insert(name, "hang_detected", {});
        this.logger.warn({ name }, "Instance appears hung");
        this.sendHangNotification(name);
        this.webhookEmitter?.emit("hang", name);
      });
    }
    daemon.on("crash_loop", () => {
      this.eventLog?.insert(name, "crash_loop", {});
      this.logger.error({ name }, "Instance in crash loop — respawn paused");
      this.notifyInstanceTopic(name, `🔴 ${name} keeps crashing shortly after launch — respawn paused. Check rate limits or run \`agend fleet restart\`.`);
      this.setTopicIcon(name, "red");
    });
    this.setTopicIcon(name, "green");
    this.touchActivity(name);
  }
147
+ async stopInstance(name) {
148
+ this.setTopicIcon(name, "remove");
149
+ this.failoverActive.delete(name);
150
+ const daemon = this.daemons.get(name);
151
+ if (daemon) {
152
+ await daemon.stop();
153
+ this.daemons.delete(name);
154
+ }
155
+ else {
156
+ const pidPath = join(this.getInstanceDir(name), "daemon.pid");
157
+ if (existsSync(pidPath)) {
158
+ const pid = parseInt(readFileSync(pidPath, "utf-8").trim(), 10);
159
+ try {
160
+ process.kill(pid, "SIGTERM");
161
+ }
162
+ catch (e) {
163
+ this.logger.debug({ err: e, pid }, "SIGTERM failed for stale process");
164
+ }
165
+ }
166
+ }
167
+ }
168
+ /** Load .env file from data dir into process.env */
169
+ loadEnvFile() {
170
+ const envPath = join(this.dataDir, ".env");
171
+ if (!existsSync(envPath))
172
+ return;
173
+ const content = readFileSync(envPath, "utf-8");
174
+ for (const line of content.split("\n")) {
175
+ const trimmed = line.trim();
176
+ if (!trimmed || trimmed.startsWith("#"))
177
+ continue;
178
+ const eqIdx = trimmed.indexOf("=");
179
+ if (eqIdx < 0)
180
+ continue;
181
+ const key = trimmed.slice(0, eqIdx);
182
+ const raw = trimmed.slice(eqIdx + 1);
183
+ const value = raw.replace(/^["'](.*)["']$/, '$1');
184
+ if (!process.env[key]) {
185
+ process.env[key] = value;
186
+ }
187
+ }
188
+ }
189
  /**
   * Boot the whole fleet from a fleet.yaml path.
   *
   * Sequence: load .env + config → reset tmux session/windows → write
   * fleet.pid → start event log, cost guard, webhooks, daily summary →
   * auto-create the "general" instance when none is flagged general_topic →
   * start every configured instance → (topic mode only) start scheduler,
   * shared adapter, topic auto-creation, routing table, icon/archive pollers
   * and per-instance IPC → start the health endpoint and install
   * SIGHUP / SIGUSR2 / SIGUSR1 handlers.
   *
   * @param {string} configPath - path to fleet.yaml.
   */
  async startAll(configPath) {
    this.configPath = configPath;
    this.loadEnvFile();
    const fleet = this.loadConfig(configPath);
    const topicMode = fleet.channel?.mode === "topic";
    await TmuxManager.ensureSession(TMUX_SESSION);
    // Stop any running daemons first (their health checks would respawn killed windows)
    for (const [name] of this.daemons) {
      await this.stopInstance(name);
    }
    // Then kill all remaining agend instance windows to prevent orphans
    const existingWindows = await TmuxManager.listWindows(TMUX_SESSION);
    for (const w of existingWindows) {
      if (w.name !== "zsh") {
        const tm = new TmuxManager(TMUX_SESSION, w.id);
        await tm.killWindow();
      }
    }
    const pidPath = join(this.dataDir, "fleet.pid");
    writeFileSync(pidPath, String(process.pid), "utf-8");
    this.eventLog = new EventLog(join(this.dataDir, "events.db"));
    // Cost guard: fleet-level overrides layered over built-in defaults.
    const costGuardConfig = {
      ...DEFAULT_COST_GUARD,
      ...fleet.defaults?.cost_guard ?? {},
    };
    this.costGuard = new CostGuard(costGuardConfig, this.eventLog);
    this.costGuard.startMidnightReset();
    const webhookConfigs = fleet.defaults?.webhooks ?? [];
    if (webhookConfigs.length > 0) {
      this.webhookEmitter = new WebhookEmitter(webhookConfigs, this.logger);
      this.logger.info({ count: webhookConfigs.length }, "Webhook emitter initialized");
    }
    this.costGuard.on("warn", (instance, totalCents, limitCents) => {
      this.notifyInstanceTopic(instance, `⚠️ ${instance} cost: ${formatCents(totalCents)} / ${formatCents(limitCents)} (${Math.round(totalCents / limitCents * 100)}%)`);
      this.webhookEmitter?.emit("cost_warning", instance, { cost_cents: totalCents, limit_cents: limitCents });
    });
    this.costGuard.on("limit", (instance, totalCents, limitCents) => {
      this.notifyInstanceTopic(instance, `🛑 ${instance} daily limit ${formatCents(limitCents)} reached — pausing instance.`);
      this.eventLog?.insert(instance, "instance_paused", { reason: "cost_limit", cost_cents: totalCents });
      this.webhookEmitter?.emit("cost_limit", instance, { cost_cents: totalCents, limit_cents: limitCents });
      // Fire-and-forget stop; failures are only logged.
      this.stopInstance(instance).catch(err => this.logger.error({ err, instance }, "Failed to pause instance on cost limit"));
    });
    const summaryConfig = {
      ...DEFAULT_DAILY_SUMMARY,
      ...fleet.defaults?.daily_summary ?? {},
    };
    // DailySummary gets a sender callback (posts to the group chat) and a
    // text builder (per-instance daily cost plus fleet total).
    this.dailySummary = new DailySummary(summaryConfig, costGuardConfig.timezone, (text) => {
      if (!this.adapter || !this.fleetConfig?.channel?.group_id)
        return;
      this.adapter.sendText(String(this.fleetConfig.channel.group_id), text)
        .catch(e => this.logger.debug({ err: e }, "Failed to send daily summary"));
    }, () => {
      const instances = Object.keys(this.fleetConfig?.instances ?? {});
      const costMap = new Map();
      for (const name of instances) {
        costMap.set(name, this.costGuard?.getDailyCostCents(name) ?? 0);
      }
      return DailySummary.generateText(this.eventLog, instances, costMap, this.costGuard?.getFleetTotalCents() ?? 0);
    });
    this.dailySummary.start();
    // Auto-create general instance if none configured
    const hasGeneralTopic = Object.values(fleet.instances).some(inst => inst.general_topic === true);
    if (!hasGeneralTopic) {
      this.logger.info("Auto-creating general instance for General Topic");
      const generalDir = join(homedir(), ".agend", "general");
      mkdirSync(generalDir, { recursive: true });
      const claudeMdPath = join(generalDir, "CLAUDE.md");
      if (!existsSync(claudeMdPath)) {
        writeFileSync(claudeMdPath, `# General Assistant

你是這個 AgEnD fleet 的通用入口。

## 行為準則

- 簡單任務(搜尋、翻譯、一般問答):自己處理。
- 屬於特定專案的任務:用 list_instances() 找到對應 agent,需要時用 start_instance() 啟動,再用 send_to_instance() 委派。
- 需要多個 agent 協作的任務:協調各 agent 並行或串行執行,收集結果後彙整。
- 使用者想開新的專案 agent:用 create_instance() 建立。
- 不再需要的 instance(例如功能完成):用 delete_instance() 清除。
- 收到其他 instance 委派的任務時,完成後一定要用 send_to_instance() 回報結果。

## 委派原則

只在有具體理由時才委派:
- 任務需要存取特定專案的檔案
- 任務可以從多 agent 平行執行中受益
- 保留自己的 context 更重要,把不相關的工作交出去
- 絕不把任務回委給委派你的 instance

自己能做的,就自己做。
`, "utf-8");
      }
      const generalConfig = {
        ...DEFAULT_INSTANCE_CONFIG,
        working_directory: generalDir,
        general_topic: true,
      };
      fleet.instances["general"] = generalConfig;
      this.saveFleetConfig();
    }
    const instanceEntries = Object.entries(fleet.instances);
    for (const [name, config] of instanceEntries) {
      await this.startInstance(name, config, topicMode).catch(err => this.logger.error({ err, name }, "Failed to start instance"));
    }
    if (topicMode && fleet.channel) {
      const schedulerConfig = {
        ...DEFAULT_SCHEDULER_CONFIG,
        ...this.fleetConfig?.defaults?.scheduler ?? {},
      };
      this.scheduler = new Scheduler(join(this.dataDir, "scheduler.db"), (schedule) => this.handleScheduleTrigger(schedule), schedulerConfig, (name) => this.fleetConfig?.instances?.[name] != null);
      this.scheduler.init();
      this.logger.info("Scheduler initialized");
      await this.startSharedAdapter(fleet);
      // Auto-create topics AFTER adapter is ready (needs adapter.createTopic)
      await this.topicCommands.autoCreateTopics();
      this.routingTable = this.buildRoutingTable();
      const routeSummary = [...this.routingTable.entries()].map(([tid, target]) => `#${tid}→${target.name}`).join(", ");
      this.logger.info(`Routes: ${routeSummary}`);
      // Resolve topic icon emoji IDs and start idle archive poller
      await this.resolveTopicIcons();
      this.startArchivePoller();
      // Fixed 3s grace period before dialing instance sockets — presumably
      // waiting for daemons to create channel.sock; TODO confirm.
      await new Promise(r => setTimeout(r, 3000));
      await this.connectToInstances(fleet);
      for (const name of Object.keys(fleet.instances)) {
        this.startStatuslineWatcher(name);
      }
    }
    // Health HTTP endpoint
    this.startHealthServer(fleet.health_port ?? 19280);
    // SIGHUP: reload scheduler (use once + re-register to avoid duplicates)
    const onSighup = () => {
      this.logger.info("Received SIGHUP, reloading scheduler...");
      this.scheduler?.reload();
      process.once("SIGHUP", onSighup);
    };
    process.once("SIGHUP", onSighup);
    // SIGUSR2: graceful in-process restart of the instances.
    const onRestart = () => {
      this.logger.info("Received SIGUSR2, initiating graceful restart...");
      this.restartInstances()
        .catch(err => this.logger.error({ err }, "Graceful restart failed"))
        .finally(() => process.once("SIGUSR2", onRestart));
    };
    process.once("SIGUSR2", onRestart);
    // SIGUSR1: full process reload (graceful stop → exit → CLI restarts)
    const onFullRestart = () => {
      this.logger.info("Received SIGUSR1, initiating full restart (process reload)...");
      this.gracefulShutdownForReload()
        .then(() => {
          this.logger.info("Full restart: shutdown complete, exiting for reload");
          process.exit(0);
        })
        .catch(err => {
          this.logger.error({ err }, "Full restart: graceful shutdown failed");
          process.exit(1);
        });
    };
    process.once("SIGUSR1", onFullRestart);
  }
348
  /**
   * Start the shared Telegram adapter for topic mode.
   * Skips silently (with a warning) when the configured bot-token env var is
   * unset. Wires inbound messages, hang-dialog callback queries, and
   * topic-closed events, then starts the adapter and a 5-minute poller that
   * prunes stale external sessions.
   *
   * @param fleet - the loaded fleet config (fleet.channel must be present).
   */
  async startSharedAdapter(fleet) {
    const channelConfig = fleet.channel;
    const botToken = process.env[channelConfig.bot_token_env];
    if (!botToken) {
      this.logger.warn({ env: channelConfig.bot_token_env }, "Bot token env not set, skipping shared adapter");
      return;
    }
    const accessDir = join(this.dataDir, "access");
    mkdirSync(accessDir, { recursive: true });
    const accessManager = new AccessManager(channelConfig.access, join(accessDir, "access.json"));
    const inboxDir = join(this.dataDir, "inbox");
    mkdirSync(inboxDir, { recursive: true });
    this.adapter = await createAdapter(channelConfig, {
      id: "fleet",
      botToken,
      accessManager,
      inboxDir,
    });
    this.adapter.on("message", (msg) => {
      this.handleInboundMessage(msg);
    });
    // "hang:<action>:<instance>" callbacks come from the hang-notification
    // inline keyboard: restart the instance or keep waiting.
    this.adapter.on("callback_query", async (data) => {
      if (data.callbackData.startsWith("hang:")) {
        const parts = data.callbackData.split(":");
        const action = parts[1];
        const instanceName = parts[2];
        if (action === "restart") {
          await this.stopInstance(instanceName);
          const config = this.fleetConfig?.instances[instanceName];
          if (config) {
            const topicMode = this.fleetConfig?.channel?.mode === "topic";
            await this.startInstance(instanceName, config, topicMode);
            await new Promise(r => setTimeout(r, 3000));
            await this.connectIpcToInstance(instanceName);
          }
          this.adapter?.editMessage(data.chatId, data.messageId, `🔄 ${instanceName} restarted.`).catch(() => { });
        }
        else {
          this.adapter?.editMessage(data.chatId, data.messageId, `⏳ Continuing to wait for ${instanceName}.`).catch(() => { });
        }
        return;
      }
    });
    this.adapter.on("topic_closed", (data) => {
      const tid = parseInt(data.threadId, 10);
      // Skip unbind if we archived this topic ourselves
      if (this.archivedTopics.has(tid))
        return;
      this.topicCommands.handleTopicDeleted(tid);
    });
    await this.topicCommands.registerBotCommands();
    await this.adapter.start();
    if (fleet.channel?.group_id) {
      this.adapter.setChatId(String(fleet.channel.group_id));
    }
    this.adapter.on("started", (username) => {
      this.logger.info(`Telegram bot @${username} polling`);
    });
    this.adapter.on("polling_conflict", ({ attempt, delay }) => {
      this.logger.warn(`409 Conflict (attempt ${attempt}), retry in ${delay / 1000}s`);
    });
    this.adapter.on("handler_error", (err) => {
      this.logger.warn({ err: err instanceof Error ? err.message : String(err) }, "Telegram handler error");
    });
    this.startTopicCleanupPoller();
    // Prune stale external sessions every 5 minutes
    this.sessionPruneTimer = setInterval(() => {
      this.pruneStaleExternalSessions().catch(err => this.logger.debug({ err }, "Session prune failed"));
    }, 5 * 60 * 1000);
  }
419
+ /** Connect IPC clients to each daemon instance's channel.sock */
420
+ async connectToInstances(fleet) {
421
+ for (const name of Object.keys(fleet.instances)) {
422
+ await this.connectIpcToInstance(name);
423
+ }
424
+ }
425
  /**
   * Connect IPC to a single instance's channel.sock and install all message
   * handlers (session registration, outbound routing, tool status, schedule
   * CRUD). No-ops when the socket file does not exist; connection failures
   * are logged as warnings, not thrown.
   */
  async connectIpcToInstance(name) {
    const sockPath = join(this.getInstanceDir(name), "channel.sock");
    if (!existsSync(sockPath))
      return;
    const ipc = new IpcClient(sockPath);
    try {
      await ipc.connect();
      this.instanceIpcClients.set(name, ipc);
      ipc.on("message", (msg) => {
        if (msg.type === "mcp_ready") {
          // Register external sessions (sessionName differs from instance name)
          const sessionName = msg.sessionName;
          if (sessionName && sessionName !== name) {
            this.sessionRegistry.set(sessionName, name);
            this.logger.info({ sessionName, instanceName: name }, "Registered external session");
          }
        }
        else if (msg.type === "session_disconnected") {
          const sessionName = msg.sessionName;
          if (sessionName && this.sessionRegistry.has(sessionName)) {
            this.sessionRegistry.delete(sessionName);
            this.logger.info({ sessionName, instanceName: name }, "Unregistered external session");
          }
        }
        else if (msg.type === "fleet_outbound") {
          // Auto-register external session on first outbound message — covers the
          // race where mcp_ready arrived before fleet manager connected and query_sessions
          // fired before the MCP server reconnected.
          const sender = msg.senderSessionName;
          if (sender && sender !== name && !this.sessionRegistry.has(sender)) {
            this.sessionRegistry.set(sender, name);
            this.logger.info({ sessionName: sender, instanceName: name }, "Registered external session");
          }
          this.handleOutboundFromInstance(name, msg).catch(err => this.logger.error({ err }, "handleOutboundFromInstance error"));
        }
        else if (msg.type === "fleet_tool_status") {
          this.handleToolStatusFromInstance(name, msg);
        }
        else if (msg.type === "fleet_schedule_create" || msg.type === "fleet_schedule_list" ||
          msg.type === "fleet_schedule_update" || msg.type === "fleet_schedule_delete") {
          this.handleScheduleCrud(name, msg);
        }
      });
      // Ask daemon for any sessions that registered before we connected
      // (fixes race condition where mcp_ready was broadcast before fleet manager connected)
      ipc.send({ type: "query_sessions" });
      this.logger.debug({ name }, "Connected to instance IPC");
      if (!this.statuslineWatchers.has(name)) {
        this.startStatuslineWatcher(name);
      }
    }
    catch (err) {
      this.logger.warn({ name, err }, "Failed to connect to instance IPC");
    }
  }
481
+ /** Handle inbound message — transcribe voice if present, then route */
482
+ findGeneralInstance() {
483
+ if (!this.fleetConfig)
484
+ return undefined;
485
+ for (const [name, config] of Object.entries(this.fleetConfig.instances)) {
486
+ if (config.general_topic === true) {
487
+ return this.daemons.has(name) ? name : undefined;
488
+ }
489
+ }
490
+ return undefined;
491
+ }
492
  /**
   * Route an inbound channel message.
   *
   * Messages with no thread id belong to the chat root ("general"): try the
   * general command handler first, then forward to the general_topic
   * instance if one is running. Threaded messages are routed through
   * this.routingTable to their bound instance; unbound topics are handed to
   * topicCommands. In both paths the instance's weekly rate limit is checked
   * before delivery, attachments are processed into text + metadata, a 👀
   * reaction acknowledges receipt, and the payload is sent over IPC as a
   * "fleet_inbound" message.
   */
  async handleInboundMessage(msg) {
    const threadId = msg.threadId ? parseInt(msg.threadId, 10) : undefined;
    if (threadId == null) {
      // General topic: check for /status command
      if (await this.topicCommands.handleGeneralCommand(msg))
        return;
      // Forward to General Topic instance if configured
      const generalInstance = this.findGeneralInstance();
      if (generalInstance) {
        if (this.replyIfRateLimited(generalInstance, msg))
          return;
        const { text, extraMeta } = await processAttachments(msg, this.adapter, this.logger, generalInstance);
        const ipc = this.instanceIpcClients.get(generalInstance);
        if (ipc) {
          // Best-effort acknowledgement reaction; failures only logged.
          if (this.adapter && msg.chatId && msg.messageId) {
            this.adapter.react(msg.chatId, msg.messageId, "👀")
              .catch(e => this.logger.debug({ err: e.message }, "Auto-react failed"));
          }
          ipc.send({
            type: "fleet_inbound",
            content: text,
            targetSession: generalInstance,
            meta: {
              chat_id: msg.chatId,
              message_id: msg.messageId,
              user: msg.username,
              user_id: msg.userId,
              ts: msg.timestamp.toISOString(),
              thread_id: "",
              ...(msg.replyToText ? { reply_to_text: msg.replyToText } : {}),
              ...extraMeta,
            },
          });
          this.logger.info(`← ${generalInstance} ${msg.username}: ${(text ?? "").slice(0, 100)}`);
        }
      }
      return;
    }
    const target = this.routingTable.get(threadId);
    if (!target) {
      this.topicCommands.handleUnboundTopic(msg);
      return;
    }
    const instanceName = target.name;
    // Reopen archived topic before routing
    if (this.archivedTopics.has(threadId)) {
      await this.reopenArchivedTopic(threadId, instanceName);
    }
    this.touchActivity(instanceName);
    this.setTopicIcon(instanceName, "blue");
    if (this.replyIfRateLimited(instanceName, msg))
      return;
    const { text, extraMeta } = await processAttachments(msg, this.adapter, this.logger, instanceName);
    const ipc = this.instanceIpcClients.get(instanceName);
    if (!ipc) {
      this.logger.warn({ instanceName }, "No IPC connection to instance");
      return;
    }
    if (this.adapter && msg.chatId && msg.messageId) {
      this.adapter.react(msg.chatId, msg.messageId, "👀")
        .catch(e => this.logger.debug({ err: e.message }, "Auto-react failed"));
    }
    ipc.send({
      type: "fleet_inbound",
      content: text,
      targetSession: instanceName, // Telegram messages → instance's own session
      meta: {
        chat_id: msg.chatId,
        message_id: msg.messageId,
        user: msg.username,
        user_id: msg.userId,
        ts: msg.timestamp.toISOString(),
        thread_id: msg.threadId ?? "",
        ...(msg.replyToText ? { reply_to_text: msg.replyToText } : {}),
        ...extraMeta,
      },
    });
    this.logger.info(`← ${instanceName} ${msg.username}: ${(text ?? "").slice(0, 100)}`);
  }
571
+ /** Handle outbound tool calls from a daemon instance */
572
+ replyIfRateLimited(instanceName, msg) {
573
+ const rl = this.instanceRateLimits.get(instanceName);
574
+ if (!rl || rl.seven_day_pct < 100)
575
+ return false;
576
+ if (this.adapter && msg.chatId) {
577
+ const threadId = msg.threadId ?? undefined;
578
+ this.adapter.sendText(msg.chatId, `⏸ ${instanceName} has hit the weekly usage limit. Your message was not delivered. Limit resets automatically — check /status for details.`, { threadId })
579
+ .catch(e => this.logger.debug({ err: e }, "Failed to send rate limit notice"));
580
+ }
581
+ this.logger.info({ instanceName }, "Blocked inbound message — weekly rate limit at 100%");
582
+ return true;
583
+ }
584
/**
 * Handle an outbound tool call from a daemon instance.
 *
 * Standard channel tools (reply, react, edit_message, download_attachment) are
 * delegated to routeToolCall(); everything else is a fleet-specific tool handled
 * by the switch below. Every path answers the caller exactly once via respond().
 *
 * @param {string} instanceName - instance that issued the tool call
 * @param {object} msg - IPC payload: { tool, args, requestId, fleetRequestId, senderSessionName }
 */
async handleOutboundFromInstance(instanceName, msg) {
    if (!this.adapter)
        return;
    this.touchActivity(instanceName);
    this.setTopicIcon(instanceName, "green");
    const tool = msg.tool;
    const args = (msg.args ?? {});
    const requestId = msg.requestId;
    const fleetRequestId = msg.fleetRequestId;
    const senderSessionName = msg.senderSessionName;
    // Reply channel back to the calling instance; fleetRequestId wins over requestId when both are set.
    const respond = (result, error) => {
        const ipc = this.instanceIpcClients.get(instanceName);
        if (fleetRequestId) {
            ipc?.send({ type: "fleet_outbound_response", fleetRequestId, result, error });
        }
        else {
            ipc?.send({ type: "fleet_outbound_response", requestId, result, error });
        }
    };
    // Resolve threadId from instance → topic_id mapping
    const instanceConfig = this.fleetConfig?.instances[instanceName];
    const threadId = resolveReplyThreadId(args.thread_id, instanceConfig);
    // Route standard channel tools (reply, react, edit_message, download_attachment)
    if (routeToolCall(this.adapter, tool, args, threadId, respond)) {
        if (tool === "reply") {
            this.logger.info(`→ ${instanceName} claude: ${(args.text ?? "").slice(0, 100)}`);
        }
        return;
    }
    // Fleet-specific tools
    switch (tool) {
        case "send_to_instance": {
            const targetName = args.instance_name;
            const message = args.message;
            if (!targetName) {
                respond(null, "send_to_instance: missing required argument 'instance_name'");
                break;
            }
            if (!message) {
                respond(null, "send_to_instance: missing required argument 'message'");
                break;
            }
            const senderLabel = senderSessionName ?? instanceName;
            const isExternalSender = senderSessionName != null && senderSessionName !== instanceName;
            // Resolve target: could be an instance name or an external session name
            let targetIpc = this.instanceIpcClients.get(targetName);
            let targetSession = targetName; // default: target is the instance itself
            let targetInstanceName = targetName;
            if (!targetIpc) {
                // Check if target is an external session
                const hostInstance = this.sessionRegistry.get(targetName);
                if (hostInstance) {
                    targetIpc = this.instanceIpcClients.get(hostInstance);
                    targetSession = targetName; // deliver to the external session
                    targetInstanceName = hostInstance;
                }
            }
            if (!targetIpc) {
                // Check if instance exists in config but is stopped
                const existsInConfig = targetName in (this.fleetConfig?.instances ?? {});
                if (existsInConfig) {
                    respond(null, `Instance '${targetName}' is stopped. Use start_instance('${targetName}') to start it first.`);
                }
                else {
                    respond(null, `Instance or session not found: ${targetName}`);
                }
                break;
            }
            // Build structured metadata (Phase 2)
            const correlationId = args.correlation_id || `cid-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
            const meta = {
                chat_id: "",
                message_id: `xmsg-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
                user: `instance:${senderLabel}`,
                user_id: `instance:${senderLabel}`,
                ts: new Date().toISOString(),
                thread_id: "",
                from_instance: senderLabel,
                correlation_id: correlationId,
            };
            // Optional metadata set by the Phase-3 wrapper tools below.
            if (args.request_kind)
                meta.request_kind = args.request_kind;
            if (args.requires_reply != null)
                meta.requires_reply = String(args.requires_reply);
            if (args.task_summary)
                meta.task_summary = args.task_summary;
            if (args.working_directory)
                meta.working_directory = args.working_directory;
            if (args.branch)
                meta.branch = args.branch;
            targetIpc.send({
                type: "fleet_inbound",
                targetSession,
                content: message,
                meta,
            });
            // Post to Telegram topics for visibility
            const groupId = this.fleetConfig?.channel?.group_id;
            if (groupId && this.adapter) {
                const senderTopicId = this.fleetConfig?.instances[instanceName]?.topic_id;
                const targetTopicId = this.fleetConfig?.instances[targetInstanceName]?.topic_id;
                // Post full message to topics — adapter handles 4096-char chunking
                // Only post to sender topic if sender is the instance itself (not external)
                if (senderTopicId && !isExternalSender) {
                    this.adapter.sendText(String(groupId), `→ ${targetName}:\n${message}`, {
                        threadId: String(senderTopicId),
                    }).catch(e => this.logger.debug({ err: e }, "Failed to post cross-instance notification"));
                }
                // Only post to target topic if target is an instance (not external session)
                if (targetTopicId && !this.sessionRegistry.has(targetName)) {
                    this.adapter.sendText(String(groupId), `← ${senderLabel}:\n${message}`, {
                        threadId: String(targetTopicId),
                    }).catch(e => this.logger.debug({ err: e }, "Failed to post cross-instance notification"));
                }
            }
            this.logger.info(`✉ ${senderLabel} → ${targetName}: ${(message ?? "").slice(0, 100)}`);
            respond({ sent: true, target: targetName, correlation_id: correlationId });
            break;
        }
        case "list_instances": {
            const senderLabel = senderSessionName ?? instanceName;
            // All configured instances except the caller itself.
            const allInstances = Object.entries(this.fleetConfig?.instances ?? {})
                .filter(([name]) => name !== instanceName && name !== senderLabel)
                .map(([name, config]) => ({
                    name,
                    type: "instance",
                    status: this.daemons.has(name) ? "running" : "stopped",
                    working_directory: config.working_directory,
                    topic_id: config.topic_id ?? null,
                    description: config.description ?? null,
                    tags: config.tags ?? [],
                    last_activity: this.lastActivity.get(name) ? new Date(this.lastActivity.get(name)).toISOString() : null,
                }));
            // Include external sessions (excluding self)
            const externalSessions = [...this.sessionRegistry.entries()]
                .filter(([sessName]) => sessName !== senderLabel)
                .map(([sessName, hostInstance]) => ({
                    name: sessName, type: "session", host: hostInstance,
                }));
            respond({ instances: allInstances, external_sessions: externalSessions });
            break;
        }
        // Phase 3: High-level collaboration tools (wrappers around send_to_instance)
        case "request_information": {
            const targetName = args.target_instance;
            const question = args.question;
            const context = args.context;
            const body = context ? `${question}\n\nContext: ${context}` : question;
            // Re-dispatch as send_to_instance with structured metadata
            args.instance_name = targetName;
            args.message = body;
            args.request_kind = "query";
            args.requires_reply = true;
            args.task_summary = question.slice(0, 120);
            // Recursively handle via the same switch (will hit send_to_instance case above)
            return this.handleOutboundFromInstance(instanceName, { tool: "send_to_instance", args, requestId, fleetRequestId, senderSessionName });
        }
        case "delegate_task": {
            const targetName = args.target_instance;
            const task = args.task;
            const criteria = args.success_criteria;
            const context = args.context;
            let body = task;
            if (criteria)
                body += `\n\nSuccess criteria: ${criteria}`;
            if (context)
                body += `\n\nContext: ${context}`;
            // Re-dispatch as send_to_instance (same pattern as request_information).
            args.instance_name = targetName;
            args.message = body;
            args.request_kind = "task";
            args.requires_reply = true;
            args.task_summary = task.slice(0, 120);
            return this.handleOutboundFromInstance(instanceName, { tool: "send_to_instance", args, requestId, fleetRequestId, senderSessionName });
        }
        case "report_result": {
            const targetName = args.target_instance;
            const summary = args.summary;
            const artifacts = args.artifacts;
            if (!args.correlation_id) {
                this.logger.warn({ instanceName, targetName }, "report_result called without correlation_id — recipient cannot match this to an original request");
            }
            let body = summary;
            if (artifacts)
                body += `\n\nArtifacts: ${artifacts}`;
            // Re-dispatch as send_to_instance; reports do not expect a reply.
            args.instance_name = targetName;
            args.message = body;
            args.request_kind = "report";
            args.requires_reply = false;
            args.task_summary = summary.slice(0, 120);
            return this.handleOutboundFromInstance(instanceName, { tool: "send_to_instance", args, requestId, fleetRequestId, senderSessionName });
        }
        // Phase 4: Capability discovery
        case "describe_instance": {
            const targetName = args.name;
            const config = this.fleetConfig?.instances[targetName];
            if (config) {
                respond({
                    name: targetName,
                    type: "instance",
                    description: config.description ?? null,
                    tags: config.tags ?? [],
                    working_directory: config.working_directory,
                    status: this.daemons.has(targetName) ? "running" : "stopped",
                    topic_id: config.topic_id ?? null,
                    model: config.model ?? null,
                    last_activity: this.lastActivity.get(targetName) ? new Date(this.lastActivity.get(targetName)).toISOString() : null,
                    worktree_source: config.worktree_source ?? null,
                });
                break;
            }
            // Check if it's a known external session
            const hostInstance = this.sessionRegistry.get(targetName);
            if (hostInstance) {
                respond({
                    name: targetName,
                    type: "session",
                    host: hostInstance,
                    status: "running",
                });
                break;
            }
            respond(null, `Instance or session '${targetName}' not found`);
            break;
        }
        case "start_instance": {
            const targetName = args.name;
            // Already running?
            if (this.daemons.has(targetName)) {
                respond({ success: true, status: "already_running" });
                break;
            }
            // Exists in config?
            const targetConfig = this.fleetConfig?.instances[targetName];
            if (!targetConfig) {
                respond(null, `Instance '${targetName}' not found in fleet config`);
                break;
            }
            try {
                await this.startInstance(targetName, targetConfig, true);
                await this.connectIpcToInstance(targetName);
                respond({ success: true, status: "started" });
            }
            catch (err) {
                respond(null, `Failed to start instance '${targetName}': ${err.message}`);
            }
            break;
        }
        case "create_instance": {
            // Expand a leading "~" to $HOME so paths from chat messages resolve.
            const directory = args.directory.replace(/^~/, process.env.HOME || "~");
            const topicName = args.topic_name || basename(directory);
            const description = args.description;
            const branch = args.branch;
            // Validate directory exists
            try {
                await access(directory);
            }
            catch {
                respond(null, `Directory does not exist: ${directory}`);
                break;
            }
            // Check for duplicate early (before worktree creation) — only when no branch
            if (!branch) {
                const expandHome = (p) => p.replace(/^~/, process.env.HOME || "~");
                const existingInstance = Object.entries(this.fleetConfig?.instances ?? {})
                    .find(([_, config]) => expandHome(config.working_directory) === directory);
                if (existingInstance) {
                    const [eName, eConfig] = existingInstance;
                    respond({
                        success: true,
                        status: "already_exists",
                        name: eName,
                        topic_id: eConfig.topic_id,
                        running: this.daemons.has(eName),
                    });
                    break;
                }
            }
            // If branch specified, create git worktree
            let workDir = directory;
            let worktreePath;
            if (branch) {
                try {
                    const { execFile: execFileCb } = await import("node:child_process");
                    const { promisify } = await import("node:util");
                    const execFileAsync = promisify(execFileCb);
                    // Verify it's a git repo
                    await execFileAsync("git", ["rev-parse", "--git-dir"], { cwd: directory });
                    // Determine worktree path: sibling directory named repo-branch
                    const repoName = basename(directory);
                    const safeBranch = branch.replace(/\//g, "-");
                    worktreePath = join(dirname(directory), `${repoName}-${safeBranch}`);
                    // Check if branch exists
                    let branchExists = false;
                    try {
                        await execFileAsync("git", ["rev-parse", "--verify", branch], { cwd: directory });
                        branchExists = true;
                    }
                    catch { /* branch doesn't exist */ }
                    // Existing branch: check it out; new branch: create it with -b.
                    if (branchExists) {
                        await execFileAsync("git", ["worktree", "add", worktreePath, branch], { cwd: directory });
                    }
                    else {
                        await execFileAsync("git", ["worktree", "add", worktreePath, "-b", branch], { cwd: directory });
                    }
                    this.logger.info({ worktreePath, branch, repo: directory }, "Created git worktree for instance");
                    workDir = worktreePath;
                }
                catch (err) {
                    respond(null, `Failed to create worktree: ${err.message}`);
                    break;
                }
            }
            // Check worktree path for duplicates (branch case only — non-branch already checked above)
            if (worktreePath) {
                const expandHome = (p) => p.replace(/^~/, process.env.HOME || "~");
                const existingInstance = Object.entries(this.fleetConfig?.instances ?? {})
                    .find(([_, config]) => expandHome(config.working_directory) === workDir);
                if (existingInstance) {
                    const [eName, eConfig] = existingInstance;
                    respond({
                        success: true,
                        status: "already_exists",
                        name: eName,
                        topic_id: eConfig.topic_id,
                        running: this.daemons.has(eName),
                    });
                    break;
                }
            }
            // Sequential steps with rollback
            let createdTopicId;
            let newInstanceName;
            try {
                // Step a: Create Telegram topic
                createdTopicId = await this.createForumTopic(topicName);
                // Step b: Register in config
                // Use topicName for worktree instances to avoid long paths (Unix socket limit 104 bytes)
                const nameBase = worktreePath ? topicName : basename(workDir);
                newInstanceName = `${sanitizeInstanceName(nameBase)}-t${createdTopicId}`;
                const instanceConfig = {
                    ...DEFAULT_INSTANCE_CONFIG,
                    ...this.fleetConfig.defaults,
                    working_directory: workDir,
                    topic_id: createdTopicId,
                    ...(description ? { description } : {}),
                    ...(args.model ? { model: args.model } : {}),
                    ...(args.backend ? { backend: args.backend } : {}),
                    ...(worktreePath ? { worktree_source: directory } : {}),
                };
                this.fleetConfig.instances[newInstanceName] = instanceConfig;
                this.routingTable.set(createdTopicId, { kind: "instance", name: newInstanceName });
                this.saveFleetConfig();
                // Step c: Start instance
                await this.startInstance(newInstanceName, instanceConfig, true);
                await this.connectIpcToInstance(newInstanceName);
                respond({
                    success: true,
                    name: newInstanceName,
                    topic_id: createdTopicId,
                    ...(worktreePath ? { worktree_path: worktreePath, branch } : {}),
                });
            }
            catch (err) {
                // Rollback in reverse order
                if (newInstanceName && this.daemons.has(newInstanceName)) {
                    await this.stopInstance(newInstanceName).catch(() => { });
                }
                if (newInstanceName && this.fleetConfig?.instances[newInstanceName]) {
                    delete this.fleetConfig.instances[newInstanceName];
                    if (createdTopicId)
                        this.routingTable.delete(createdTopicId);
                    this.saveFleetConfig();
                }
                if (createdTopicId) {
                    await this.deleteForumTopic(createdTopicId);
                }
                // Rollback worktree
                if (worktreePath) {
                    try {
                        const { execFile: execFileCb } = await import("node:child_process");
                        const { promisify } = await import("node:util");
                        const execFileAsync = promisify(execFileCb);
                        await execFileAsync("git", ["worktree", "remove", "--force", worktreePath], { cwd: directory });
                    }
                    catch { /* best-effort worktree cleanup */ }
                }
                respond(null, `Failed to create instance: ${err.message}`);
            }
            break;
        }
        case "delete_instance": {
            // NOTE: shadows the method's instanceName parameter — this case operates
            // on the named target, not on the calling instance.
            const instanceName = args.name;
            const deleteTopic = args.delete_topic ?? false;
            const instanceConfig = this.fleetConfig?.instances[instanceName];
            if (!instanceConfig) {
                respond(null, `Instance not found: ${instanceName}`);
                break;
            }
            if (instanceConfig.general_topic) {
                respond(null, "Cannot delete the General instance");
                break;
            }
            // Delete Telegram topic if requested (before removeInstance clears config)
            if (deleteTopic && instanceConfig.topic_id) {
                await this.deleteForumTopic(instanceConfig.topic_id);
            }
            await this.removeInstance(instanceName);
            respond({ success: true, name: instanceName, topic_deleted: deleteTopic });
            break;
        }
        default:
            respond(null, `Unknown tool: ${tool}`);
    }
}
999
+ /** Handle tool status update from a daemon instance */
1000
+ handleToolStatusFromInstance(instanceName, msg) {
1001
+ if (!this.adapter)
1002
+ return;
1003
+ const text = msg.text;
1004
+ const editMessageId = msg.editMessageId;
1005
+ const instanceConfig = this.fleetConfig?.instances[instanceName];
1006
+ const threadId = instanceConfig?.topic_id ? String(instanceConfig.topic_id) : undefined;
1007
+ const chatId = this.adapter.getChatId();
1008
+ if (!chatId)
1009
+ return;
1010
+ if (editMessageId) {
1011
+ this.adapter.editMessage(chatId, editMessageId, text).catch(e => this.logger.debug({ err: e }, "Failed to edit tool status message"));
1012
+ }
1013
+ else {
1014
+ this.adapter.sendText(chatId, text, { threadId }).then((sent) => {
1015
+ const ipc = this.instanceIpcClients.get(instanceName);
1016
+ ipc?.send({ type: "fleet_tool_status_ack", messageId: sent.messageId });
1017
+ }).catch(e => this.logger.debug({ err: e }, "Failed to send tool status message"));
1018
+ }
1019
+ }
1020
+ // ===================== Scheduler =====================
1021
+ async handleScheduleTrigger(schedule) {
1022
+ const { target, reply_chat_id, reply_thread_id, message, label, id, source } = schedule;
1023
+ const RATE_LIMIT_DEFER_THRESHOLD = 85;
1024
+ const rl = this.instanceRateLimits.get(target);
1025
+ if (rl && rl.five_hour_pct > RATE_LIMIT_DEFER_THRESHOLD) {
1026
+ this.scheduler.recordRun(id, "deferred", `5hr rate limit at ${rl.five_hour_pct}%`);
1027
+ this.eventLog?.insert(target, "schedule_deferred", {
1028
+ schedule_id: id,
1029
+ label,
1030
+ five_hour_pct: rl.five_hour_pct,
1031
+ });
1032
+ this.webhookEmitter?.emit("schedule_deferred", target, { schedule_id: id, label, five_hour_pct: rl.five_hour_pct });
1033
+ this.notifyInstanceTopic(target, `⏳ Schedule "${label ?? id}" deferred — rate limit at ${rl.five_hour_pct}%`);
1034
+ this.logger.info({ target, scheduleId: id, rateLimitPct: rl.five_hour_pct }, "Schedule deferred due to rate limit");
1035
+ return;
1036
+ }
1037
+ const defaults = this.fleetConfig?.defaults;
1038
+ const schedulerDefaults = defaults?.scheduler;
1039
+ const retryCount = schedulerDefaults?.retry_count ?? 3;
1040
+ const retryInterval = schedulerDefaults?.retry_interval_ms ?? 30_000;
1041
+ const deliver = () => {
1042
+ const ipc = this.instanceIpcClients.get(target);
1043
+ if (!ipc?.connected)
1044
+ return false;
1045
+ ipc.send({
1046
+ type: "fleet_schedule_trigger",
1047
+ payload: { schedule_id: id, message: `[排程任務] ${message}`, label },
1048
+ meta: { chat_id: reply_chat_id, thread_id: reply_thread_id, user: "scheduler" },
1049
+ });
1050
+ return true;
1051
+ };
1052
+ if (deliver()) {
1053
+ this.scheduler.recordRun(id, "delivered");
1054
+ if (source !== target)
1055
+ this.notifySourceTopic(schedule);
1056
+ return;
1057
+ }
1058
+ for (let i = 0; i < retryCount; i++) {
1059
+ await new Promise((r) => setTimeout(r, retryInterval));
1060
+ if (deliver()) {
1061
+ this.scheduler.recordRun(id, "delivered");
1062
+ if (source !== target)
1063
+ this.notifySourceTopic(schedule);
1064
+ return;
1065
+ }
1066
+ }
1067
+ this.scheduler.recordRun(id, "instance_offline", `retry ${retryCount}x failed`);
1068
+ this.notifyScheduleFailure(schedule);
1069
+ }
1070
+ notifySourceTopic(schedule) {
1071
+ if (!this.adapter)
1072
+ return;
1073
+ const text = `⏰ 排程「${schedule.label ?? schedule.id}」已觸發,目標實例:${schedule.target}`;
1074
+ this.adapter.sendText(schedule.reply_chat_id, text, {
1075
+ threadId: schedule.reply_thread_id ?? undefined,
1076
+ }).catch((err) => this.logger.error({ err }, "Failed to send cross-instance notification"));
1077
+ }
1078
+ notifyScheduleFailure(schedule) {
1079
+ if (!this.adapter)
1080
+ return;
1081
+ const text = `⏰ 排程「${schedule.label ?? schedule.id}」觸發失敗:實例 ${schedule.target} 未在線。`;
1082
+ this.adapter.sendText(schedule.reply_chat_id, text, {
1083
+ threadId: schedule.reply_thread_id ?? undefined,
1084
+ }).catch((err) => this.logger.error({ err }, "Failed to send schedule failure notification"));
1085
+ }
1086
+ handleScheduleCrud(instanceName, msg) {
1087
+ const fleetRequestId = msg.fleetRequestId;
1088
+ const payload = (msg.payload ?? {});
1089
+ const meta = (msg.meta ?? {});
1090
+ const ipc = this.instanceIpcClients.get(instanceName);
1091
+ if (!ipc)
1092
+ return;
1093
+ try {
1094
+ let result;
1095
+ switch (msg.type) {
1096
+ case "fleet_schedule_create": {
1097
+ const params = {
1098
+ cron: payload.cron,
1099
+ message: payload.message,
1100
+ source: instanceName,
1101
+ target: payload.target || instanceName,
1102
+ reply_chat_id: meta.chat_id,
1103
+ reply_thread_id: meta.thread_id || null,
1104
+ label: payload.label,
1105
+ timezone: payload.timezone,
1106
+ };
1107
+ result = this.scheduler.create(params);
1108
+ break;
1109
+ }
1110
+ case "fleet_schedule_list":
1111
+ result = this.scheduler.list(payload.target);
1112
+ break;
1113
+ case "fleet_schedule_update":
1114
+ result = this.scheduler.update(payload.id, payload);
1115
+ break;
1116
+ case "fleet_schedule_delete":
1117
+ this.scheduler.delete(payload.id);
1118
+ result = "ok";
1119
+ break;
1120
+ }
1121
+ ipc.send({ type: "fleet_schedule_response", fleetRequestId, result });
1122
+ }
1123
+ catch (err) {
1124
+ ipc.send({ type: "fleet_schedule_response", fleetRequestId, error: err.message });
1125
+ }
1126
+ }
1127
+ // ===================== Topic management =====================
1128
+ /** Create a forum topic via the adapter. Returns the message_thread_id. */
1129
+ async createForumTopic(topicName) {
1130
+ if (!this.adapter?.createTopic) {
1131
+ throw new Error("Adapter does not support topic creation");
1132
+ }
1133
+ return this.adapter.createTopic(topicName);
1134
+ }
1135
+ async deleteForumTopic(topicId) {
1136
+ try {
1137
+ const groupId = this.fleetConfig?.channel?.group_id;
1138
+ const botTokenEnv = this.fleetConfig?.channel?.bot_token_env;
1139
+ if (!groupId || !botTokenEnv)
1140
+ return;
1141
+ const botToken = process.env[botTokenEnv];
1142
+ if (!botToken)
1143
+ return;
1144
+ await fetch(`https://api.telegram.org/bot${botToken}/deleteForumTopic`, {
1145
+ method: "POST",
1146
+ headers: { "Content-Type": "application/json" },
1147
+ body: JSON.stringify({ chat_id: groupId, message_thread_id: topicId }),
1148
+ });
1149
+ }
1150
+ catch (err) {
1151
+ this.logger.warn({ err, topicId }, "Failed to delete forum topic during rollback");
1152
+ }
1153
+ }
1154
// Interval handle for the periodic topic-existence poller (see startTopicCleanupPoller).
topicCleanupTimer = null;
// Interval handle presumably used to prune stale external-session registrations —
// NOTE(review): the prune logic is not visible in this chunk; confirm before relying on it.
sessionPruneTimer = null;
1156
+ /** Periodically check if bound topics still exist */
1157
+ startTopicCleanupPoller() {
1158
+ this.topicCleanupTimer = setInterval(async () => {
1159
+ if (!this.fleetConfig?.channel?.group_id || !this.adapter?.topicExists)
1160
+ return;
1161
+ for (const [threadId, target] of this.routingTable) {
1162
+ try {
1163
+ if (!isProbeableRouteTarget(target)) {
1164
+ continue;
1165
+ }
1166
+ const exists = await this.adapter.topicExists(threadId);
1167
+ if (!exists) {
1168
+ await this.topicCommands.handleTopicDeleted(threadId);
1169
+ }
1170
+ }
1171
+ catch (err) {
1172
+ this.logger.debug({ err, threadId }, "Topic existence check failed");
1173
+ }
1174
+ }
1175
+ }, 5 * 60_000);
1176
+ }
1177
+ /** Save fleet config back to fleet.yaml */
1178
+ saveFleetConfig() {
1179
+ if (!this.fleetConfig || !this.configPath)
1180
+ return;
1181
+ const toSave = {};
1182
+ if (this.fleetConfig.project_roots)
1183
+ toSave.project_roots = this.fleetConfig.project_roots;
1184
+ if (this.fleetConfig.channel)
1185
+ toSave.channel = this.fleetConfig.channel;
1186
+ if (this.fleetConfig.health_port)
1187
+ toSave.health_port = this.fleetConfig.health_port;
1188
+ if (Object.keys(this.fleetConfig.defaults).length > 0)
1189
+ toSave.defaults = this.fleetConfig.defaults;
1190
+ toSave.instances = {};
1191
+ for (const [name, inst] of Object.entries(this.fleetConfig.instances)) {
1192
+ const serialized = {
1193
+ working_directory: inst.working_directory,
1194
+ topic_id: inst.topic_id,
1195
+ };
1196
+ // Preserve all optional user-configured fields so saveFleetConfig() never silently drops them
1197
+ if (inst.general_topic)
1198
+ serialized.general_topic = true;
1199
+ if (inst.description)
1200
+ serialized.description = inst.description;
1201
+ if (inst.tags?.length)
1202
+ serialized.tags = inst.tags;
1203
+ if (inst.model)
1204
+ serialized.model = inst.model;
1205
+ if (inst.model_failover?.length)
1206
+ serialized.model_failover = inst.model_failover;
1207
+ if (inst.worktree_source)
1208
+ serialized.worktree_source = inst.worktree_source;
1209
+ if (inst.backend)
1210
+ serialized.backend = inst.backend;
1211
+ if (inst.systemPrompt)
1212
+ serialized.systemPrompt = inst.systemPrompt;
1213
+ if (inst.skipPermissions)
1214
+ serialized.skipPermissions = inst.skipPermissions;
1215
+ if (inst.lightweight)
1216
+ serialized.lightweight = inst.lightweight;
1217
+ if (inst.cost_guard)
1218
+ serialized.cost_guard = inst.cost_guard;
1219
+ toSave.instances[name] = serialized;
1220
+ }
1221
+ writeFileSync(this.configPath, yaml.dump(toSave, { lineWidth: 120 }));
1222
+ this.logger.info({ path: this.configPath }, "Saved fleet config");
1223
+ }
1224
/**
 * Fully remove an instance: its schedules, running daemon, git worktree,
 * IPC connection, routing-table entry, and fleet.yaml entry — in that order.
 * No-op for unknown names; the General instance is never removed.
 * @param {string} name - instance to remove
 */
async removeInstance(name) {
    const config = this.fleetConfig?.instances[name];
    if (!config)
        return;
    // Never remove the General instance
    if (config.general_topic) {
        this.logger.warn({ name }, "Refusing to remove General instance");
        return;
    }
    // Clean up schedules
    if (this.scheduler && config.topic_id) {
        const count = this.scheduler.deleteByInstanceOrThread(name, String(config.topic_id));
        if (count > 0) {
            this.logger.info({ name, count }, "Cleaned up schedules for deleted instance");
        }
    }
    // Stop daemon if running
    if (this.daemons.has(name)) {
        await this.stopInstance(name);
    }
    // Clean up git worktree if applicable
    if (config.worktree_source && config.working_directory) {
        const { existsSync } = await import("node:fs");
        if (!existsSync(config.working_directory)) {
            this.logger.info({ worktree: config.working_directory }, "Worktree directory already gone, skipping removal");
        }
        else {
            try {
                const { execFile: execFileCb } = await import("node:child_process");
                const { promisify } = await import("node:util");
                const execFileAsync = promisify(execFileCb);
                // git worktree remove must run from the source repo, not the worktree itself.
                await execFileAsync("git", ["worktree", "remove", "--force", config.working_directory], {
                    cwd: config.worktree_source,
                });
                this.logger.info({ worktree: config.working_directory }, "Removed git worktree");
            }
            catch (err) {
                // Best-effort: a failed worktree removal does not block instance deletion.
                this.logger.warn({ err, worktree: config.working_directory }, "Failed to remove git worktree");
            }
        }
    }
    // Clean up IPC
    const ipc = this.instanceIpcClients.get(name);
    if (ipc) {
        await ipc.close();
        this.instanceIpcClients.delete(name);
    }
    // Remove from routing table
    if (config.topic_id) {
        this.routingTable.delete(config.topic_id);
    }
    // Remove from fleet config and save
    delete this.fleetConfig.instances[name];
    this.saveFleetConfig();
    this.logger.info({ name }, "Instance removed");
}
1280
+ startStatuslineWatcher(name) {
1281
+ const statusFile = join(this.getInstanceDir(name), "statusline.json");
1282
+ const timer = setInterval(() => {
1283
+ try {
1284
+ const data = JSON.parse(readFileSync(statusFile, "utf-8"));
1285
+ this.costGuard?.updateCost(name, data.cost?.total_cost_usd ?? 0);
1286
+ const rl = data.rate_limits;
1287
+ if (rl) {
1288
+ const prev = this.instanceRateLimits.get(name);
1289
+ const newSevenDay = rl.seven_day?.used_percentage ?? 0;
1290
+ if (prev?.seven_day_pct === 100 && newSevenDay < 100) {
1291
+ this.notifyInstanceTopic(name, `✅ ${name} weekly usage limit has reset — instance is available again.`);
1292
+ this.logger.info({ name }, "Weekly rate limit recovered");
1293
+ }
1294
+ this.instanceRateLimits.set(name, {
1295
+ five_hour_pct: rl.five_hour?.used_percentage ?? 0,
1296
+ seven_day_pct: newSevenDay,
1297
+ });
1298
+ this.checkModelFailover(name, rl.five_hour?.used_percentage ?? 0);
1299
+ }
1300
+ }
1301
+ catch { /* file may not exist yet or be mid-write */ }
1302
+ }, 10_000);
1303
+ this.statuslineWatchers.set(name, timer);
1304
+ }
1305
// ── Model failover ──────────────────────────────────────────────────────
// 5-hour usage % at/above which checkModelFailover switches to the fallback model.
static FAILOVER_TRIGGER_PCT = 90;
// 5-hour usage % below which an active failover reverts to the primary model;
// the 90/50 gap is hysteresis so the model doesn't flap near one threshold.
static FAILOVER_RECOVER_PCT = 50;
1308
+ checkModelFailover(name, fiveHourPct) {
1309
+ const config = this.fleetConfig?.instances[name];
1310
+ if (!config?.model_failover?.length)
1311
+ return;
1312
+ const daemon = this.daemons.get(name);
1313
+ if (!daemon)
1314
+ return;
1315
+ const failoverList = config.model_failover;
1316
+ const primaryModel = failoverList[0];
1317
+ const currentFailover = this.failoverActive.get(name);
1318
+ if (fiveHourPct >= FleetManager.FAILOVER_TRIGGER_PCT && !currentFailover) {
1319
+ // Trigger failover: pick next model in list
1320
+ const fallbackModel = failoverList.length > 1 ? failoverList[1] : undefined;
1321
+ if (!fallbackModel)
1322
+ return;
1323
+ this.failoverActive.set(name, fallbackModel);
1324
+ daemon.setModelOverride(fallbackModel);
1325
+ this.logger.info({ instance: name, from: primaryModel, to: fallbackModel, ratePct: fiveHourPct }, "Model failover triggered");
1326
+ this.eventLog?.insert(name, "model_failover", {
1327
+ from: primaryModel, to: fallbackModel, five_hour_pct: fiveHourPct,
1328
+ });
1329
+ this.webhookEmitter?.emit("model_failover", name, { from: primaryModel, to: fallbackModel, five_hour_pct: fiveHourPct });
1330
+ this.notifyInstanceTopic(name, `⚡ Rate limit ${fiveHourPct}% — next rotation will use ${fallbackModel} (was ${primaryModel})`);
1331
+ }
1332
+ else if (fiveHourPct < FleetManager.FAILOVER_RECOVER_PCT && currentFailover) {
1333
+ // Recover: switch back to primary
1334
+ this.failoverActive.delete(name);
1335
+ daemon.setModelOverride(undefined);
1336
+ this.logger.info({ instance: name, restored: primaryModel, ratePct: fiveHourPct }, "Model failover recovered");
1337
+ this.eventLog?.insert(name, "model_recovered", {
1338
+ restored: primaryModel, five_hour_pct: fiveHourPct,
1339
+ });
1340
+ this.webhookEmitter?.emit("model_recovered", name, { restored: primaryModel, five_hour_pct: fiveHourPct });
1341
+ this.notifyInstanceTopic(name, `✅ Rate limit recovered (${fiveHourPct}%) — next rotation will use ${primaryModel}`);
1342
+ }
1343
+ }
1344
+ notifyInstanceTopic(instanceName, text) {
1345
+ if (!this.adapter)
1346
+ return;
1347
+ const groupId = this.fleetConfig?.channel?.group_id;
1348
+ if (!groupId)
1349
+ return;
1350
+ const threadId = this.fleetConfig?.instances[instanceName]?.topic_id;
1351
+ this.adapter.sendText(String(groupId), text, {
1352
+ threadId: threadId != null ? String(threadId) : undefined,
1353
+ }).catch(e => this.logger.debug({ err: e }, "Failed to send notification"));
1354
+ }
1355
+ async sendHangNotification(instanceName) {
1356
+ if (!this.adapter)
1357
+ return;
1358
+ const groupId = this.fleetConfig?.channel?.group_id;
1359
+ if (!groupId)
1360
+ return;
1361
+ const threadId = this.fleetConfig?.instances[instanceName]?.topic_id;
1362
+ this.setTopicIcon(instanceName, "red");
1363
+ await this.adapter.notifyAlert(String(groupId), {
1364
+ type: "hang",
1365
+ instanceName,
1366
+ message: `⚠️ ${instanceName} appears hung (no activity for 15+ minutes)`,
1367
+ choices: [
1368
+ { id: `hang:restart:${instanceName}`, label: "🔄 Force restart" },
1369
+ { id: `hang:wait:${instanceName}`, label: "⏳ Keep waiting" },
1370
+ ],
1371
+ }, {
1372
+ threadId: threadId != null ? String(threadId) : undefined,
1373
+ }).catch(e => this.logger.debug({ err: e }, "Failed to send hang notification"));
1374
+ }
1375
+ // ── Topic icon + auto-archive ─────────────────────────────────────────────
1376
+ /** Fetch forum topic icon stickers and pick emoji IDs for each state */
1377
+ async resolveTopicIcons() {
1378
+ if (!this.adapter?.getTopicIconStickers)
1379
+ return;
1380
+ try {
1381
+ const stickers = await this.adapter.getTopicIconStickers();
1382
+ if (stickers.length === 0)
1383
+ return;
1384
+ // Telegram's getForumTopicIconStickers returns a fixed set.
1385
+ // Try to match by emoji character, fall back to positional.
1386
+ const find = (targets) => stickers.find((s) => targets.some((t) => s.emoji.includes(t)));
1387
+ const green = find(["🟢", "✅", "💚"]);
1388
+ const blue = find(["🔵", "💙", "📘"]);
1389
+ const red = find(["🔴", "❌", "💔"]);
1390
+ this.topicIcons = {
1391
+ green: green?.customEmojiId ?? stickers[0]?.customEmojiId,
1392
+ blue: blue?.customEmojiId ?? stickers[1]?.customEmojiId ?? stickers[0]?.customEmojiId,
1393
+ red: red?.customEmojiId ?? stickers[Math.min(5, stickers.length - 1)]?.customEmojiId,
1394
+ };
1395
+ this.logger.info({ icons: this.topicIcons }, "Resolved topic icon emoji IDs");
1396
+ }
1397
+ catch (err) {
1398
+ this.logger.debug({ err }, "Failed to resolve topic icons (non-fatal)");
1399
+ }
1400
+ }
1401
+ /** Set topic icon based on instance state */
1402
+ setTopicIcon(instanceName, state) {
1403
+ const topicId = this.fleetConfig?.instances[instanceName]?.topic_id;
1404
+ if (topicId == null || !this.adapter?.editForumTopic)
1405
+ return;
1406
+ const emojiId = state === "remove" ? "" : this.topicIcons[state];
1407
+ if (emojiId == null && state !== "remove")
1408
+ return; // no icon resolved
1409
+ this.adapter.editForumTopic(topicId, { iconCustomEmojiId: emojiId })
1410
+ .catch((e) => this.logger.debug({ err: e, instanceName, state }, "Topic icon update failed"));
1411
+ }
1412
+ /** Track activity timestamp for idle detection */
1413
+ touchActivity(instanceName) {
1414
+ this.lastActivity.set(instanceName, Date.now());
1415
+ }
1416
+ /** Start periodic idle archive checker */
1417
+ startArchivePoller() {
1418
+ this.archiveTimer = setInterval(() => {
1419
+ this.archiveIdleTopics().catch((err) => this.logger.debug({ err }, "Archive idle check failed"));
1420
+ }, 30 * 60_000); // check every 30 minutes
1421
+ }
1422
+ /** Close topics that have been idle beyond threshold */
1423
+ async archiveIdleTopics() {
1424
+ if (!this.adapter?.closeForumTopic || !this.fleetConfig)
1425
+ return;
1426
+ const now = Date.now();
1427
+ for (const [name, config] of Object.entries(this.fleetConfig.instances)) {
1428
+ const topicId = config.topic_id;
1429
+ if (topicId == null || config.general_topic)
1430
+ continue;
1431
+ if (this.archivedTopics.has(topicId))
1432
+ continue;
1433
+ const status = this.getInstanceStatus(name);
1434
+ if (status !== "running")
1435
+ continue; // only archive running-but-idle
1436
+ const last = this.lastActivity.get(name) ?? 0;
1437
+ if (last === 0)
1438
+ continue; // never active → skip (just started)
1439
+ if (now - last < FleetManager.ARCHIVE_IDLE_MS)
1440
+ continue;
1441
+ this.logger.info({ name, topicId, idleHours: Math.round((now - last) / 3600000) }, "Archiving idle topic");
1442
+ this.archivedTopics.add(topicId);
1443
+ this.setTopicIcon(name, "remove");
1444
+ await this.adapter.closeForumTopic(topicId);
1445
+ }
1446
+ }
1447
+ /** Reopen an archived topic and restore icon */
1448
+ async reopenArchivedTopic(topicId, instanceName) {
1449
+ if (!this.archivedTopics.has(topicId))
1450
+ return;
1451
+ this.archivedTopics.delete(topicId);
1452
+ if (this.adapter?.reopenForumTopic) {
1453
+ await this.adapter.reopenForumTopic(topicId);
1454
+ }
1455
+ this.setTopicIcon(instanceName, "green");
1456
+ this.touchActivity(instanceName);
1457
+ this.logger.info({ instanceName, topicId }, "Reopened archived topic");
1458
+ }
1459
+ clearStatuslineWatchers() {
1460
+ for (const [, timer] of this.statuslineWatchers)
1461
+ clearInterval(timer);
1462
+ this.statuslineWatchers.clear();
1463
+ this.instanceRateLimits.clear();
1464
+ this.failoverActive.clear();
1465
+ }
1466
/**
 * Tear down the whole fleet. Order is deliberate: timers/watchers first (so
 * nothing fires mid-shutdown), then daemons, then IPC clients, then the
 * channel adapter and health server, and finally the event log and PID file.
 */
async stopAll() {
    // Stop periodic work before touching daemons.
    this.clearStatuslineWatchers();
    this.costGuard?.stop();
    this.dailySummary?.stop();
    if (this.topicCleanupTimer) {
        clearInterval(this.topicCleanupTimer);
        this.topicCleanupTimer = null;
    }
    if (this.sessionPruneTimer) {
        clearInterval(this.sessionPruneTimer);
        this.sessionPruneTimer = null;
    }
    if (this.archiveTimer) {
        clearInterval(this.archiveTimer);
        this.archiveTimer = null;
    }
    this.scheduler?.shutdown();
    // Stop all daemons in parallel; allSettled so one failure doesn't block the rest.
    await Promise.allSettled([...this.daemons.entries()].map(async ([name, daemon]) => {
        try {
            await daemon.stop();
        }
        catch (err) {
            this.logger.warn({ name, err }, "Stop failed");
        }
        // Remove from the map even when stop() failed, so state reflects intent.
        this.daemons.delete(name);
    }));
    for (const [, ipc] of this.instanceIpcClients) {
        await ipc.close();
    }
    this.instanceIpcClients.clear();
    if (this.adapter) {
        await this.adapter.stop();
        this.adapter = null;
    }
    if (this.healthServer) {
        this.healthServer.close();
        this.healthServer = null;
    }
    this.eventLog?.close();
    // Best-effort PID file removal — it may already be gone.
    const pidPath = join(this.dataDir, "fleet.pid");
    try {
        unlinkSync(pidPath);
    }
    catch (e) {
        this.logger.debug({ err: e }, "Failed to remove fleet PID file");
    }
}
1513
+ /**
1514
+ * Prune stale external sessions by re-querying each daemon for live sessions.
1515
+ * Sessions in the registry that are no longer reported by any daemon are removed.
1516
+ */
1517
+ async pruneStaleExternalSessions() {
1518
+ const liveSessions = new Set();
1519
+ // Ask each daemon for its currently connected external sessions
1520
+ const queries = [...this.instanceIpcClients.entries()].map(([name, ipc]) => {
1521
+ if (!ipc.connected)
1522
+ return Promise.resolve();
1523
+ return new Promise((resolve) => {
1524
+ const timeout = setTimeout(resolve, 5000);
1525
+ const handler = (msg) => {
1526
+ if (msg.type !== "query_sessions_response")
1527
+ return;
1528
+ ipc.removeListener("message", handler);
1529
+ clearTimeout(timeout);
1530
+ for (const s of msg.sessions)
1531
+ liveSessions.add(s);
1532
+ resolve();
1533
+ };
1534
+ ipc.on("message", handler);
1535
+ ipc.send({ type: "query_sessions" });
1536
+ });
1537
+ });
1538
+ await Promise.all(queries);
1539
+ // Remove sessions not found in any daemon
1540
+ let pruned = 0;
1541
+ for (const [sessionName] of this.sessionRegistry) {
1542
+ if (!liveSessions.has(sessionName)) {
1543
+ this.sessionRegistry.delete(sessionName);
1544
+ this.logger.info({ sessionName }, "Pruned stale external session");
1545
+ pruned++;
1546
+ }
1547
+ }
1548
+ if (pruned > 0) {
1549
+ this.logger.info({ pruned, remaining: this.sessionRegistry.size }, "Session registry pruned");
1550
+ }
1551
+ return pruned;
1552
+ }
1553
/**
 * Graceful shutdown for full reload: wait for idle, notify, then stop everything.
 * The caller is expected to exit the process after this resolves.
 * Idle-wait is capped at 5 minutes; on timeout we force-stop anyway.
 */
async gracefulShutdownForReload() {
    const instanceNames = [...this.daemons.keys()];
    if (instanceNames.length === 0) {
        this.logger.info("No instances to stop");
        await this.stopAll();
        return;
    }
    this.logger.info(`Full restart: waiting for ${instanceNames.length} instances to idle...`);
    const groupId = this.fleetConfig?.channel?.group_id;
    if (groupId && this.adapter) {
        // Best-effort heads-up in the channel before we go quiet.
        await this.adapter.sendText(String(groupId), `🔄 Full restart initiated — waiting for all instances to idle, then reloading process...`)
            .catch(e => this.logger.debug({ err: e }, "Failed to post full restart notification"));
    }
    // Wait for idle with 5-minute timeout; the deadline promise only ever rejects.
    const IDLE_TIMEOUT_MS = 5 * 60 * 1000;
    let timeoutHandle;
    const idleDeadline = new Promise((_, reject) => {
        timeoutHandle = setTimeout(() => reject(new Error("Idle wait timed out after 5 minutes")), IDLE_TIMEOUT_MS);
    });
    try {
        await Promise.race([
            Promise.all(instanceNames.map(async (name) => {
                const daemon = this.daemons.get(name);
                if (daemon) {
                    this.logger.info(`Waiting for ${name} to idle...`);
                    await daemon.waitForIdle(10_000);
                    this.logger.info(`${name} is idle`);
                }
            })),
            idleDeadline,
        ]);
    }
    catch (err) {
        // Timeout (or idle-wait failure) → proceed to force stop.
        this.logger.warn({ err }, "Idle wait timed out — force stopping");
    }
    finally {
        // Always clear the deadline timer so it can't fire later.
        clearTimeout(timeoutHandle);
    }
    this.logger.info("All instances idle — stopping for reload...");
    await this.stopAll();
}
1598
/**
 * Graceful restart: wait for all instances to be idle (capped at 5 minutes),
 * then stop and start them in-process using a freshly reloaded config.
 * After startup, notifies the channel and pushes a resume message to each
 * instance's topic and daemon IPC so sessions continue where they left off.
 */
async restartInstances() {
    if (!this.configPath) {
        this.logger.error("Cannot restart: no config path (was startAll called?)");
        return;
    }
    const instanceNames = [...this.daemons.keys()];
    if (instanceNames.length === 0) {
        this.logger.info("No instances to restart");
        return;
    }
    this.logger.info(`Graceful restart: waiting for ${instanceNames.length} instances to idle...`);
    const groupId = this.fleetConfig?.channel?.group_id;
    if (groupId && this.adapter) {
        // Best-effort channel notification before the restart begins.
        await this.adapter.sendText(String(groupId), `🔄 Graceful restart initiated — waiting for all instances to idle...`)
            .catch(e => this.logger.debug({ err: e }, "Failed to post restart notification"));
    }
    // Idle-wait with a 5-minute hard deadline; the deadline promise only rejects.
    const IDLE_TIMEOUT_MS = 5 * 60 * 1000;
    let timeoutHandle;
    const idleDeadline = new Promise((_, reject) => {
        timeoutHandle = setTimeout(() => reject(new Error("Idle wait timed out after 5 minutes")), IDLE_TIMEOUT_MS);
    });
    try {
        await Promise.race([
            Promise.all(instanceNames.map(async (name) => {
                const daemon = this.daemons.get(name);
                if (daemon) {
                    this.logger.info(`Waiting for ${name} to idle...`);
                    await daemon.waitForIdle(10_000);
                    this.logger.info(`${name} is idle`);
                }
            })),
            idleDeadline,
        ]);
    }
    catch (err) {
        // Timeout → restart anyway rather than hang forever.
        this.logger.warn({ err }, "Idle wait timed out — force restarting");
    }
    finally {
        clearTimeout(timeoutHandle);
    }
    this.logger.info("All instances idle — restarting...");
    // Tear down watchers and IPC before stopping instances so nothing races.
    this.clearStatuslineWatchers();
    for (const [, ipc] of this.instanceIpcClients) {
        await ipc.close();
    }
    this.instanceIpcClients.clear();
    await Promise.allSettled(instanceNames.map(name => this.stopInstance(name)));
    // Reload config from disk so restarts pick up edits made while running.
    const fleet = this.loadConfig(this.configPath);
    this.fleetConfig = fleet;
    const topicMode = fleet.channel?.mode === "topic";
    for (const [name, config] of Object.entries(fleet.instances)) {
        await this.startInstance(name, config, topicMode);
    }
    if (topicMode) {
        this.routingTable = this.buildRoutingTable();
        // Give freshly started daemons a moment before reconnecting IPC.
        await new Promise(r => setTimeout(r, 3000));
        await this.connectToInstances(fleet);
        for (const name of Object.keys(fleet.instances)) {
            this.startStatuslineWatcher(name);
        }
    }
    this.logger.info("Graceful restart complete");
    if (groupId && this.adapter) {
        await this.adapter.sendText(String(groupId), `✅ Graceful restart complete — ${this.daemons.size} instances running`)
            .catch(e => this.logger.debug({ err: e }, "Failed to post restart completion notification"));
        // Notify each instance's channel so Claude resumes work
        const instances = Object.entries(this.fleetConfig?.instances ?? {});
        this.logger.info({ count: instances.length }, "Sending restart notification to instances");
        for (const [name, config] of instances) {
            const threadId = config.topic_id != null ? String(config.topic_id) : undefined;
            // Send to Telegram topic so the message appears in the chat
            if (threadId) {
                this.adapter.sendText(String(groupId), "Fleet restart complete. Continue from where you left off.", { threadId })
                    .catch(e => this.logger.warn({ err: e, name, threadId }, "Failed to post per-instance restart notification"));
            }
            // Push to daemon IPC so the Claude session receives the message
            const ipc = this.instanceIpcClients.get(name);
            if (ipc?.connected) {
                ipc.send({
                    type: "fleet_inbound",
                    content: "Fleet restart complete. Continue from where you left off.",
                    meta: {
                        chat_id: String(groupId),
                        thread_id: threadId ?? "",
                        ts: new Date().toISOString(),
                    },
                });
            }
        }
    }
}
1692
+ // ── Health HTTP endpoint ─────────────────────────────────────────────
1693
+ startHealthServer(port) {
1694
+ this.startedAt = Date.now();
1695
+ this.healthServer = createServer((req, res) => {
1696
+ res.setHeader("Content-Type", "application/json");
1697
+ if (req.method === "GET" && req.url === "/health") {
1698
+ const instanceCount = this.fleetConfig?.instances
1699
+ ? Object.keys(this.fleetConfig.instances).length
1700
+ : 0;
1701
+ res.writeHead(200);
1702
+ res.end(JSON.stringify({
1703
+ status: "ok",
1704
+ instances: instanceCount,
1705
+ uptime: Math.floor((Date.now() - this.startedAt) / 1000),
1706
+ }));
1707
+ return;
1708
+ }
1709
+ if (req.method === "GET" && req.url === "/status") {
1710
+ const instances = Object.keys(this.fleetConfig?.instances ?? {}).map(name => {
1711
+ const statusFile = join(this.getInstanceDir(name), "statusline.json");
1712
+ let context_pct = 0;
1713
+ let cost = 0;
1714
+ try {
1715
+ const data = JSON.parse(readFileSync(statusFile, "utf-8"));
1716
+ context_pct = data.context_window?.used_percentage ?? 0;
1717
+ cost = data.cost?.total_cost_usd ?? 0;
1718
+ }
1719
+ catch { /* statusline not yet available */ }
1720
+ return {
1721
+ name,
1722
+ status: this.getInstanceStatus(name),
1723
+ context_pct,
1724
+ cost,
1725
+ };
1726
+ });
1727
+ res.writeHead(200);
1728
+ res.end(JSON.stringify({ instances }));
1729
+ return;
1730
+ }
1731
+ res.writeHead(404);
1732
+ res.end(JSON.stringify({ error: "not found" }));
1733
+ });
1734
+ this.healthServer.listen(port, "127.0.0.1", () => {
1735
+ this.logger.info({ port }, "Health endpoint listening");
1736
+ });
1737
+ }
1738
+ }
1739
+ //# sourceMappingURL=fleet-manager.js.map