npm - @link-assistant/hive-mind - Versions diffs - 1.23.5 → 1.23.6 - Mend

@link-assistant/hive-mind 1.23.5 → 1.23.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/CHANGELOG.md +6 -0
package/package.json +2 -2
package/src/telegram-bot-launcher.lib.mjs +190 -0
package/src/telegram-bot.mjs +20 -20

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,11 @@
 # @link-assistant/hive-mind
+## 1.23.6
+### Patch Changes
+- 0a7dbcf: Add exponential backoff retry when bot launch fails with 409 Conflict error (e.g., due to restart overlap, stale connections, or network issues). Retry schedule: 1s, 2s, 4s, ... up to 10 minutes max. Non-retryable errors (401 Unauthorized) still cause immediate exit.
 ## 1.23.5
 ### Patch Changes

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@link-assistant/hive-mind",
-  "version": "1.23.5",
+  "version": "1.23.6",
   "description": "AI-powered issue solver and hive mind for collaborative problem solving",
   "main": "src/hive.mjs",
   "type": "module",
@@ -13,7 +13,7 @@
     "hive-telegram-bot": "./src/telegram-bot.mjs"
   },
   "scripts": {
-    "test": "node tests/solve-queue.test.mjs && node tests/limits-display.test.mjs && node tests/test-usage-limit.mjs && node tests/test-telegram-message-filters.mjs && node tests/test-solve-queue-command.mjs && node tests/test-queue-display-1267.mjs",
+    "test": "node tests/solve-queue.test.mjs && node tests/limits-display.test.mjs && node tests/test-usage-limit.mjs && node tests/test-telegram-message-filters.mjs && node tests/test-solve-queue-command.mjs && node tests/test-queue-display-1267.mjs && node tests/test-telegram-bot-launcher.mjs",
     "test:queue": "node tests/solve-queue.test.mjs",
     "test:limits-display": "node tests/limits-display.test.mjs",
     "test:usage-limit": "node tests/test-usage-limit.mjs",

package/src/telegram-bot-launcher.lib.mjs ADDED Viewed

@@ -0,0 +1,190 @@
+/**
+ * Bot launcher with exponential backoff retry for Telegraf polling mode.
+ *
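+ * Every launch error except those listed in NON_RETRYABLE_CODES (currently
+ * only 401 Unauthorized) is treated as transient -- e.g. 409 Conflict, network
+ * errors, 5xx -- and retried with exponential backoff. Non-retryable errors
+ * are rethrown to the caller immediately; the launcher itself never exits the
+ * process.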
+ *
+ * @see https://github.com/link-assistant/hive-mind/issues/1240
+ * @see https://core.telegram.org/bots/api#getupdates
+ */
+/**
+ * Default configuration for the retry mechanism.
+ *
+ * calculateRetryDelay() falls back to these values for every option the
+ * caller leaves undefined. Note that maxDelayMs caps the exponential term
+ * before jitter is added, so a computed delay can exceed maxDelayMs by up to
+ * jitterFraction * maxDelayMs.
+ */
+export const LAUNCHER_DEFAULTS = {
+  baseDelayMs: 1000, // Delay before the first retry: 1 second
+  maxDelayMs: 10 * 60 * 1000, // Cap on the exponential delay (pre-jitter): 10 minutes
+  backoffMultiplier: 2, // Growth factor applied per additional attempt
+  jitterFraction: 0.1, // Up to 10% random extra delay to prevent thundering herd
+};
+/**
+ * Error codes that should NOT be retried (fatal errors).
+ * 401 = Invalid bot token -- retrying won't help.
+ */
+const NON_RETRYABLE_CODES = new Set([401]);
+/**
+ * Determines whether a given error is retryable.
+ *
+ * This is a deny-list check: every error is retryable unless its code is in
+ * NON_RETRYABLE_CODES. Consequently 409 (Conflict), 429 (rate limit), 5xx
+ * server errors and network errors (string codes such as ECONNRESET or
+ * ETIMEDOUT, or no code at all) are all retried, and so is any other code
+ * that has not been explicitly listed as fatal.
+ *
+ * Non-retryable: 401 (Unauthorized/invalid token)
+ *
+ * @param {Error|null|undefined} error - The thrown value to classify
+ * @returns {boolean} true if the error is retryable
+ */
+export function isRetryableError(error) {
+  // Any value can be thrown in JavaScript, so tolerate null/undefined instead
+  // of raising a TypeError from inside the caller's catch block.
+  // `response.error_code` is the Telegram Bot API error field; it is consulted
+  // only when the error object exposes no `code` of its own.
+  const code = error?.code ?? error?.response?.error_code;
+  return !NON_RETRYABLE_CODES.has(code);
+}
+/**
+ * Computes how long to wait before the next launch attempt.
+ *
+ * The delay grows geometrically with the attempt number, is clamped to the
+ * configured ceiling, and then receives a random positive jitter on top:
+ *
+ *   round(min(baseDelay * multiplier^(attempt-1), maxDelay) * (1 + jitterFraction * rand))
+ *
+ * where rand is Math.random(). Because jitter is applied after clamping, the
+ * result may exceed maxDelayMs by up to jitterFraction * maxDelayMs.
+ *
+ * @param {number} attempt - Current attempt number (1-based)
+ * @param {object} [options] - Overrides; undefined fields fall back to LAUNCHER_DEFAULTS
+ * @param {number} [options.baseDelayMs] - Base delay in milliseconds
+ * @param {number} [options.maxDelayMs] - Maximum delay cap in milliseconds
+ * @param {number} [options.backoffMultiplier] - Exponential growth factor
+ * @param {number} [options.jitterFraction] - Fraction of delay to use as jitter (0-1)
+ * @returns {number} Delay in milliseconds before next retry
+ */
+export function calculateRetryDelay(attempt, options = {}) {
+  const {
+    baseDelayMs: base = LAUNCHER_DEFAULTS.baseDelayMs,
+    maxDelayMs: ceiling = LAUNCHER_DEFAULTS.maxDelayMs,
+    backoffMultiplier: factor = LAUNCHER_DEFAULTS.backoffMultiplier,
+    jitterFraction: spread = LAUNCHER_DEFAULTS.jitterFraction,
+  } = options;
+  // Clamp first, then add jitter proportional to the clamped value.
+  const clamped = Math.min(base * factor ** (attempt - 1), ceiling);
+  return Math.round(clamped + clamped * spread * Math.random());
+}
+/**
+ * Renders a millisecond duration as a short label for log output.
+ *
+ * The value is first rounded to the nearest whole second. Durations under one
+ * minute print as "<n>s"; longer ones print as "<m>m", followed by " <s>s"
+ * only when the seconds part is non-zero.
+ *
+ * @param {number} delayMs - Delay in milliseconds
+ * @returns {string} Human-readable delay (e.g., "5s", "2m 30s", "10m")
+ */
+export function formatDelay(delayMs) {
+  const wholeSeconds = Math.round(delayMs / 1000);
+  if (wholeSeconds < 60) {
+    return `${wholeSeconds}s`;
+  }
+  const minutePart = Math.floor(wholeSeconds / 60);
+  const secondPart = wholeSeconds % 60;
+  const secondSuffix = secondPart === 0 ? '' : ` ${secondPart}s`;
+  return `${minutePart}m${secondSuffix}`;
+}
+/**
+ * Launches a Telegraf bot with retry logic and exponential backoff.
+ *
+ * On each attempt:
+ * 1. Deletes any existing webhook (to prevent webhook/polling conflict)
+ * 2. Calls bot.launch(launchOptions)
+ *
+ * If either step throws:
+ * - when isRetryableError(error) is false, the error is logged and rethrown
+ * - otherwise the launcher waits calculateRetryDelay(attempt, ...) milliseconds
+ *   and starts the next attempt
+ *
+ * There is no attempt limit: the loop ends only on success, on a
+ * non-retryable error, or when `signal` is aborted.
+ *
+ * NOTE(review): this function returns only after `await bot.launch()` settles.
+ * Some Telegraf 4.x releases appear to keep that promise pending for as long
+ * as polling runs -- confirm against the pinned Telegraf version that the
+ * caller's success handler is actually reached.
+ *
+ * @param {object} bot - Telegraf bot instance
+ * @param {object} launchOptions - Options passed to bot.launch()
+ * @param {object} [retryOptions] - Retry configuration
+ * @param {number} [retryOptions.baseDelayMs] - Initial retry delay (default: 1000)
+ * @param {number} [retryOptions.maxDelayMs] - Maximum retry delay (default: 600000)
+ * @param {number} [retryOptions.backoffMultiplier] - Growth factor (default: 2)
+ * @param {number} [retryOptions.jitterFraction] - Jitter fraction (default: 0.1)
+ * @param {boolean} [retryOptions.verbose] - Enable verbose logging
+ * @param {Function} [retryOptions.onRetry] - Callback on each retry: (attempt, error, delayMs) => void
+ * @param {AbortSignal} [retryOptions.signal] - AbortSignal to cancel retry loop
+ * @returns {Promise<void>} Resolves when bot is successfully launched
+ * @throws {Error} The original error when it is non-retryable, or an Error
+ *   with code 'ABORT' when the signal is aborted before an attempt or during
+ *   a retry wait
+ */
+export async function launchBotWithRetry(bot, launchOptions, retryOptions = {}) {
+  const { verbose = false, onRetry, signal, ...backoffConfig } = retryOptions;
+  let attempt = 0;
+  while (true) {
+    // Check if abort was requested (e.g., during shutdown)
+    if (signal?.aborted) {
+      throw createLaunchAbortError('Bot launch aborted');
+    }
+    attempt++;
+    try {
+      // Step 1: Delete webhook to prevent webhook/polling conflict
+      if (verbose) console.log(`[VERBOSE] Launch attempt ${attempt}: deleting webhook...`);
+      await bot.telegram.deleteWebhook({ drop_pending_updates: true });
+      if (verbose) console.log(`[VERBOSE] Launch attempt ${attempt}: starting polling...`);
+      // Step 2: Launch bot in polling mode
+      await bot.launch(launchOptions);
+      // Success -- bot is running
+      if (attempt > 1) {
+        console.log(`✅ Bot launched successfully after ${attempt} attempts`);
+      }
+      return;
+    } catch (error) {
+      // Check if the error is retryable
+      if (!isRetryableError(error)) {
+        console.error(`❌ Non-retryable error (${error.code}): ${error.message}`);
+        throw error;
+      }
+      // Calculate delay with exponential backoff
+      const delayMs = calculateRetryDelay(attempt, backoffConfig);
+      console.warn(`⚠️  Bot launch attempt ${attempt} failed (${error?.code || 'unknown'}): ${error?.message}. Retrying in ${formatDelay(delayMs)}...`);
+      if (verbose) {
+        console.warn(`[VERBOSE] Retry delay: ${delayMs}ms, next attempt: ${attempt + 1}`);
+        if (error?.response) {
+          console.warn('[VERBOSE] API response:', JSON.stringify(error.response));
+        }
+      }
+      // Notify retry callback if provided
+      if (onRetry) {
+        onRetry(attempt, error, delayMs);
+      }
+      // Wait before retrying (interruptible via AbortSignal)
+      await waitBeforeRetry(delayMs, signal);
+    }
+  }
+}
+/**
+ * Builds the error thrown when the retry loop is cancelled. Every abort error
+ * carries code 'ABORT' so callers can tell a cancellation from a launch failure.
+ *
+ * @param {string} message - Error message
+ * @returns {Error} Error with `code` set to 'ABORT'
+ */
+function createLaunchAbortError(message) {
+  const abortError = new Error(message);
+  abortError.code = 'ABORT';
+  return abortError;
+}
+/**
+ * Sleeps for delayMs, rejecting early if the signal is (or becomes) aborted.
+ *
+ * The abort listener is removed when the timer fires. Without that, every
+ * retry would leave one more listener on the long-lived signal.
+ *
+ * @param {number} delayMs - Time to wait in milliseconds
+ * @param {AbortSignal} [signal] - Optional signal that cancels the wait
+ * @returns {Promise<void>} Resolves after the delay, rejects with an 'ABORT' error on cancellation
+ */
+function waitBeforeRetry(delayMs, signal) {
+  return new Promise((resolve, reject) => {
+    if (signal?.aborted) {
+      reject(createLaunchAbortError('Bot launch aborted during retry wait'));
+      return;
+    }
+    let timer;
+    const onAbort = () => {
+      clearTimeout(timer);
+      reject(createLaunchAbortError('Bot launch aborted during retry wait'));
+    };
+    timer = setTimeout(() => {
+      // Detach inside the timer callback itself: the callback handed to
+      // setTimeout is the one that runs, so cleanup must live here.
+      signal?.removeEventListener('abort', onAbort);
+      resolve();
+    }, delayMs);
+    signal?.addEventListener('abort', onAbort, { once: true });
+  });
+}

package/src/telegram-bot.mjs CHANGED Viewed

@@ -45,6 +45,8 @@ const { escapeMarkdown, escapeMarkdownV2, cleanNonPrintableChars, makeSpecialCha
 const { getSolveQueue, createQueueExecuteCallback } = await import('./telegram-solve-queue.lib.mjs');
 // Import extracted message filter functions for testability (issue #1207)
 const { isOldMessage: _isOldMessage, isGroupChat: _isGroupChat, isChatAuthorized: _isChatAuthorized, isForwardedOrReply: _isForwardedOrReply, extractCommandFromText } = await import('./telegram-message-filters.lib.mjs');
+// Import bot launcher with exponential backoff retry (issue #1240)
+const { launchBotWithRetry } = await import('./telegram-bot-launcher.lib.mjs');
 const config = yargs(hideBin(process.argv))
   .usage('Usage: hive-telegram-bot [options]')
@@ -1395,26 +1397,22 @@ if (VERBOSE) {
   console.log('[VERBOSE] Bot start time (ISO):', new Date(BOT_START_TIME * 1000).toISOString());
 }
-// Delete existing webhook (critical: webhooks prevent polling from working)
-if (VERBOSE) console.log('[VERBOSE] Deleting webhook...');
-bot.telegram
-  .deleteWebhook({ drop_pending_updates: true })
-  .then(result => {
-    if (VERBOSE) {
-      console.log('[VERBOSE] Webhook deletion result:', result);
-    }
-    console.log('🔄 Webhook deleted (if existed), starting polling mode...');
-    if (VERBOSE) {
-      console.log('[VERBOSE] Launching bot with config:', {
-        allowedUpdates: ['message'],
-        dropPendingUpdates: true,
-      });
-    }
-    return bot.launch({
-      allowedUpdates: ['message', 'callback_query'], // Receive messages and callback queries
-      dropPendingUpdates: true, // Drop pending updates sent before bot started
-    });
-  })
+// Launch bot with retry logic (issue #1240: handle 409 Conflict with exponential backoff)
+// The launcher handles deleteWebhook + bot.launch() with retry on transient errors.
+// Non-retryable errors (401 Unauthorized) cause immediate exit.
+const launchAbortController = new AbortController();
+launchBotWithRetry(
+  bot,
+  {
+    allowedUpdates: ['message', 'callback_query'], // Receive messages and callback queries
+    dropPendingUpdates: true, // Drop pending updates sent before bot started
+  },
+  {
+    verbose: VERBOSE,
+    signal: launchAbortController.signal,
+  }
+)
   .then(async () => {
     if (isShuttingDown) return; // Skip success messages if shutting down
@@ -1483,6 +1481,7 @@ process.once('SIGINT', () => {
   isShuttingDown = true;
   console.log('\n🛑 Received SIGINT (Ctrl+C), stopping bot...');
   if (VERBOSE) console.log(`[VERBOSE] Signal: SIGINT, PID: ${process.pid}, PPID: ${process.ppid}`);
+  launchAbortController.abort(); // Cancel retry loop if still retrying (issue #1240)
   stopSolveQueue();
   bot.stop('SIGINT');
 });
@@ -1491,6 +1490,7 @@ process.once('SIGTERM', () => {
   isShuttingDown = true;
   console.log('\n🛑 Received SIGTERM, stopping bot... (Check system logs: journalctl -u <service> or dmesg)');
   if (VERBOSE) console.log(`[VERBOSE] Signal: SIGTERM, PID: ${process.pid}, PPID: ${process.ppid}`);
+  launchAbortController.abort(); // Cancel retry loop if still retrying (issue #1240)
   stopSolveQueue();
   bot.stop('SIGTERM');
 });