@songsid/agend 0.0.13 → 0.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -82,6 +82,10 @@ export class FleetManager {
82
82
  classicChannels = null;
83
83
  // Model failover state
84
84
  failoverActive = new Map(); // instance → current failover model
85
+ // IPC reconnect: tracks instances being intentionally stopped (skip reconnect)
86
+ ipcStoppingInstances = new Set();
87
+ // Adapter restart: prevents re-entrant restart attempts
88
+ adapterRestarting = new Set();
85
89
  // Health endpoint
86
90
  healthServer = null;
87
91
  startedAt = 0;
@@ -783,6 +787,10 @@ export class FleetManager {
783
787
  this.adapter.on("handler_error", safeHandler((err) => {
784
788
  this.logger.warn({ err: err instanceof Error ? err.message : String(err) }, "Adapter handler error");
785
789
  }, this.logger, "adapter.handler_error"));
790
+ this.adapter.on("error", (err) => {
791
+ this.logger.error({ err }, "Primary adapter fatal error");
792
+ this.restartAdapter(this.adapter, "primary").catch(() => { });
793
+ });
786
794
  this.adapter.on("new_group_detected", safeHandler((data) => {
787
795
  const adminMsg = `🆕 Bot added to new server:\n• Name: ${data.groupTitle}\n• ID: ${data.groupId}\n• Platform: ${data.source}\n\nTo allow: add \`${data.groupId}\` to classicBot.yaml \`allowed_guilds\``;
788
796
  const generalId = this.findGeneralInstance();
@@ -972,6 +980,10 @@ export class FleetManager {
972
980
  if (generalId)
973
981
  this.notifyInstanceTopic(generalId, adminMsg);
974
982
  }, this.logger, `adapter[${adapterId}].new_group_detected`));
983
+ adapter.on("error", (err) => {
984
+ this.logger.error({ err, adapterId }, "Additional adapter fatal error");
985
+ this.restartAdapter(adapter, adapterId).catch(() => { });
986
+ });
975
987
  this.logger.info({ adapterId, type: channelConfig.type }, "Additional adapter started");
976
988
  }
977
989
  /** Connect IPC to a single instance with all handlers */
@@ -979,6 +991,7 @@ export class FleetManager {
979
991
  // Close existing client to prevent socket leak on reconnect
980
992
  const existing = this.instanceIpcClients.get(name);
981
993
  if (existing) {
994
+ this.ipcStoppingInstances.add(name);
982
995
  try {
983
996
  existing.close();
984
997
  }
@@ -986,6 +999,7 @@ export class FleetManager {
986
999
  this.logger.debug({ err, name }, "IPC client close failed (likely already closed)");
987
1000
  }
988
1001
  this.instanceIpcClients.delete(name);
1002
+ this.ipcStoppingInstances.delete(name);
989
1003
  }
990
1004
  const sockPath = join(this.getInstanceDir(name), "channel.sock");
991
1005
  if (!existsSync(sockPath))
@@ -1049,11 +1063,89 @@ export class FleetManager {
1049
1063
  if (!this.statuslineWatcher.has(name)) {
1050
1064
  this.statuslineWatcher.watch(name);
1051
1065
  }
1066
+ // Auto-reconnect on disconnect (unless intentionally stopping)
1067
+ ipc.on("disconnect", () => {
1068
+ this.instanceIpcClients.delete(name);
1069
+ if (this.ipcStoppingInstances.has(name))
1070
+ return;
1071
+ this.ipcReconnect(name).catch(() => { });
1072
+ });
1052
1073
  }
1053
1074
  catch (err) {
1054
1075
  this.logger.warn({ name, err }, "Failed to connect to instance IPC");
1055
1076
  }
1056
1077
  }
1078
+ /** Attempt IPC reconnection with exponential backoff */
1079
+ async ipcReconnect(name) {
1080
+ for (let attempt = 1;; attempt++) {
1081
+ if (this.ipcStoppingInstances.has(name) || !this.daemons.has(name))
1082
+ return;
1083
+ const delay = attempt <= 3 ? 3000 * Math.pow(2, attempt - 1) : 60_000; // 3s, 6s, 12s, then 60s
1084
+ await new Promise(r => setTimeout(r, delay));
1085
+ if (this.ipcStoppingInstances.has(name) || !this.daemons.has(name))
1086
+ return;
1087
+ try {
1088
+ await this.connectIpcToInstance(name);
1089
+ if (this.instanceIpcClients.has(name)) {
1090
+ this.logger.info({ name, attempt }, "IPC reconnected");
1091
+ return;
1092
+ }
1093
+ }
1094
+ catch { /* retry */ }
1095
+ // Periodic pane health check (every attempt after initial 3)
1096
+ if (attempt >= 3) {
1097
+ const instanceDir = this.getInstanceDir(name);
1098
+ const windowIdPath = join(instanceDir, "window-id");
1099
+ if (existsSync(windowIdPath)) {
1100
+ const windowId = readFileSync(windowIdPath, "utf-8").trim();
1101
+ if (windowId) {
1102
+ try {
1103
+ const { execSync } = await import("node:child_process");
1104
+ execSync(`tmux list-panes -t "${windowId}"`, { stdio: "ignore" });
1105
+ }
1106
+ catch {
1107
+ // Pane dead — respawn
1108
+ this.logger.info({ name }, "Tmux pane dead after IPC loss — respawning instance");
1109
+ this.restartSingleInstance(name).catch(err => this.logger.error({ name, err }, "Auto-respawn after IPC loss failed"));
1110
+ return;
1111
+ }
1112
+ }
1113
+ }
1114
+ }
1115
+ if (attempt % 10 === 0) {
1116
+ this.logger.warn({ name, attempt }, "IPC reconnect still failing");
1117
+ }
1118
+ }
1119
+ }
1120
+ /** Restart a channel adapter after fatal error with infinite retry + 60s cap */
1121
+ async restartAdapter(adapter, id) {
1122
+ if (this.adapterRestarting.has(id))
1123
+ return;
1124
+ this.adapterRestarting.add(id);
1125
+ try {
1126
+ for (let attempt = 1;; attempt++) {
1127
+ if (this.ipcStoppingInstances.has("__fleet_stopping__"))
1128
+ return;
1129
+ const delay = attempt <= 3 ? 5000 * Math.pow(2, attempt - 1) : 60_000; // 5s, 10s, 20s, then 60s
1130
+ await new Promise(r => setTimeout(r, delay));
1131
+ if (this.ipcStoppingInstances.has("__fleet_stopping__"))
1132
+ return;
1133
+ try {
1134
+ await adapter.stop().catch(() => { });
1135
+ await adapter.start();
1136
+ this.logger.info({ id, attempt }, "Adapter restarted successfully");
1137
+ return;
1138
+ }
1139
+ catch { /* retry */ }
1140
+ if (attempt % 10 === 0) {
1141
+ this.logger.warn({ id, attempt }, "Adapter restart still failing");
1142
+ }
1143
+ }
1144
+ }
1145
+ finally {
1146
+ this.adapterRestarting.delete(id);
1147
+ }
1148
+ }
1057
1149
  /** Handle inbound message — transcribe voice if present, then route */
1058
1150
  findGeneralInstance(adapterId) {
1059
1151
  if (!this.fleetConfig)
@@ -2629,6 +2721,7 @@ When users create specialized instances, suggest these configurations:
2629
2721
  return `🛑 Agent stopped in this channel.`;
2630
2722
  }
2631
2723
  async stopAll() {
2724
+ this.ipcStoppingInstances.add("__fleet_stopping__");
2632
2725
  this.clearStatuslineWatchers();
2633
2726
  this.costGuard?.stop();
2634
2727
  this.dailySummary?.stop();
@@ -2655,6 +2748,8 @@ When users create specialized instances, suggest these configurations:
2655
2748
  // Concurrency limited to avoid overwhelming the tmux server.
2656
2749
  const STOP_CONCURRENCY = 5;
2657
2750
  const entries = [...this.daemons.entries()];
2751
+ for (const [name] of entries)
2752
+ this.ipcStoppingInstances.add(name);
2658
2753
  for (let i = 0; i < entries.length; i += STOP_CONCURRENCY) {
2659
2754
  const batch = entries.slice(i, i + STOP_CONCURRENCY);
2660
2755
  await Promise.all(batch.map(async ([name, daemon]) => {
@@ -2671,6 +2766,7 @@ When users create specialized instances, suggest these configurations:
2671
2766
  await ipc.close();
2672
2767
  }
2673
2768
  this.instanceIpcClients.clear();
2769
+ this.ipcStoppingInstances.clear();
2674
2770
  for (const [, w] of this.worlds) {
2675
2771
  await w.stop().catch(() => { });
2676
2772
  }