npm - @1presence/bridge - Versions diffs - 0.33.0 → 0.35.0 - Mend

@1presence/bridge 0.33.0 → 0.35.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/claude.js CHANGED Viewed

@@ -111,9 +111,59 @@ function renderHistoryMessage(msg) {
 }
 // ─── Active processes ─────────────────────────────────────────────────────────
 const active = new Map();
+// conversationId → pending retry timer. A retry is scheduled with a backoff
+// delay, during which the conversation has NO entry in `active`. If a new user
+// message arrives in that window it must cancel the stale retry (otherwise the
+// retry would re-run the OLD turn's history and clobber the new one). The
+// supersede block clears any pending timer here before spawning.
+const pendingRetries = new Map();
+// Automatic retries when the `claude` CLI exits non-zero BEFORE producing any
+// real output. This covers the known Claude Code print-mode 400 regression that
+// surfaces as "API Error: 400 due to tool use concurrency issues" (GitHub
+// anthropics/claude-code#18131, still open) — it is non-deterministic enough
+// that a fresh spawn often succeeds. We retry ONLY when the failed attempt
+// emitted an "API Error:" line and produced no real assistant text and no tool
+// calls, so a failure that lands after real work (where retrying could
+// double-execute a side-effectful tool) is surfaced, never silently re-run.
+//
+// 2 retries = up to 3 attempts/turn. The first retry captures nearly all of the
+// transient wins; further attempts buy little on transient failures but add
+// latency and re-send the full (1M-context) history again on deterministic ones
+// — see vault Bugs.md. Retries use escalating backoff (avoids a subscription
+// rate-limit cascade from rapid re-spawns) and stop once the time elapsed since
+// the first attempt started reaches the wall-clock cap, so a slow-failing
+// attempt can't strand the user.
+// All below the SSE boundary, so the user sees only a slightly longer
+// "thinking" gap, never an intermediate error.
+const MAX_TURN_RETRIES = 2;
+const RETRY_BACKOFF_BASE_MS = 750; // delay = base * attempt# → 750ms, 1500ms
+const RETRY_WALL_CLOCK_CAP_MS = 12_000; // stop retrying past this much elapsed
+// Map a non-zero CLI exit + any captured "API Error:" line to a concise,
+// user-facing Local Mode message. The raw upstream text stays in operator logs
+// only — we never echo a wall of provider error JSON into the chat. Referring
+// to "Claude Code" here is intentional and consistent with Local Mode's other
+// operational errors (e.g. the "claude CLI not found" message): in Local Mode
+// the user is knowingly running their own Claude Code install.
+//
+// NOTE on the 400 tool-use case: this is an open Claude Code print-mode
+// regression (introduced in 2.1.19, still present in 2.1.146 — the current
+// latest), so upgrading does NOT fix it. We deliberately do not suggest an
+// upgrade; the automatic retry is the real mitigation and resending sometimes
+// gets through.
/**
 * Map a non-zero `claude` CLI exit, plus any captured "API Error:" text, to a
 * concise, single-line, user-facing Local Mode message.
 *
 * Classification, in order:
 *  1. An "API Error: 400" that mentions tool use / concurrency / parallelism is
 *     reported with a fixed explanation and a link to the upstream issue.
 *  2. Any other text starting with "API Error:" is reported with the prefix
 *     stripped, whitespace collapsed and the detail capped in length, so a
 *     multi-line provider error body is never dumped into the chat verbatim.
 *  3. Anything else falls back to a generic message carrying the exit code.
 *
 * @param {number|null|undefined} code - Exit code of the CLI process; only used
 *   by the generic fallback ("unknown" when nullish).
 * @param {string|null|undefined} apiErrorText - Captured "API Error: ..." text,
 *   or empty/nullish when none was seen.
 * @returns {string} The message to surface to the user.
 */
function describeCliFailure(code, apiErrorText) {
    // Upper bound on how much upstream detail is echoed to the user.
    const MAX_DETAIL_CHARS = 300;
    // Tolerate a missing argument: this runs on an error path and must not throw.
    const t = String(apiErrorText ?? '').trim();
    if (/API Error:\s*400/i.test(t) && /(tool use|concurren|parallel)/i.test(t)) {
        // NOTE(review): the wording asserts that retries happened, but this
        // function cannot tell whether any retry actually ran — confirm callers
        // only reach this branch after retrying, or soften the text.
        return 'Local Mode hit a known Claude Code error (a print-mode bug that affects every current version). I retried a few times automatically — sending the message again sometimes gets through. See https://github.com/anthropics/claude-code/issues/18131';
    }
    if (/^API Error:/i.test(t)) {
        // Collapse newlines/indentation so the message stays on one line, then cap it.
        const detail = t.replace(/^API Error:\s*/i, '').replace(/\s+/g, ' ').trim();
        const shown = detail.length > MAX_DETAIL_CHARS
            ? `${detail.slice(0, MAX_DETAIL_CHARS)}…`
            : detail;
        return `Local Mode error from Claude Code: ${shown}`;
    }
    return `Local Mode stopped unexpectedly (claude exited with code ${code ?? 'unknown'}). Please try again.`;
}
 // ─── Spawn ────────────────────────────────────────────────────────────────────
 function spawnClaude(params) {
-    const { conversationId, presenceSessionId, text, uid, history, vaultFileOpen, clientCapabilities, syncedFolders, onEvent, onDone, onError } = params;
+    const { conversationId, presenceSessionId, text, uid, history, vaultFileOpen, clientCapabilities, syncedFolders, onEvent, onDone, onError, onNotice } = params;
+    const attemptIdx = params._attemptIdx ?? 0;
+    const firstAttemptAt = params._firstAttemptAt ?? Date.now();
     const systemPromptPath = (0, path_1.join)((0, os_1.tmpdir)(), `agent-${uid}.md`);
     const mcpConfigPath = (0, path_1.join)((0, os_1.tmpdir)(), `mcp-${uid}.json`);
     if (verbose) {
@@ -125,6 +175,13 @@ function spawnClaude(params) {
         process.stderr.write(paint('90', `[bridge:verbose] conversation:  ${conversationId}`) + '\n');
         process.stderr.write(paint('90', `[bridge:verbose] history turns: ${history.length}`) + '\n');
     }
+    // Surface the user's UID before the session line in every mode — it's the
+    // Firestore doc prefix (`sessions/<uid>_<conversationId>`), so logging it
+    // makes a reported bridge failure correlatable to the stored session without
+    // having to ask which account hit it. The CLI's own `--session-id` is
+    // ephemeral and is NOT the Firestore conversationId, so the uid is the key
+    // join column when debugging.
+    process.stderr.write(`[bridge] user ${uid}\n`);
     // Debug transcript: lead with the user prompt for this turn (the clean
     // message, before the gateway's ephemeral-context prefix), plus the session
     // id (correlates with the chat URL / Firestore session doc) and a hint at
@@ -162,6 +219,19 @@ function spawnClaude(params) {
         existing.kill('SIGTERM');
         active.delete(conversationId);
     }
+    // Cancel any retry scheduled for this conversation that hasn't fired yet.
+    // Without this, a new user message arriving during a retry's backoff window
+    // would race the stale retry — which carries the OLD turn's history and would
+    // clobber the new turn. Skip when this call IS the retry firing (attemptIdx>0,
+    // the timer already deleted itself before invoking us).
+    if (attemptIdx === 0) {
+        const pending = pendingRetries.get(conversationId);
+        if (pending) {
+            clearTimeout(pending);
+            pendingRetries.delete(conversationId);
+            process.stderr.write(`[bridge] cancelled pending retry for ${conversationId} (superseded by new turn)\n`);
+        }
+    }
     // Note: ephemeral context (vault_file_open / client_capabilities / synced_folders)
     // is injected into the last user message by the gateway BEFORE history is
     // sent over the WS. The bridge no longer constructs `userMessageText` —
@@ -198,6 +268,10 @@ function spawnClaude(params) {
     //   across turns of a chat — even with --no-session-persistence. The
     //   bridge passes the per-spawn `conversationId` here; the presence
     //   sessionId is correlated separately via bridge logs and spool records.
+    // The CLI treats --session-id as "claim this new session ID" and rejects a
+    // reused id with "Session ID X is already in use". A retry is a fresh spawn,
+    // so it MUST use a new uuid; the first attempt keeps the correlation id.
+    const spawnSessionId = attemptIdx === 0 ? presenceSessionId : crypto.randomUUID();
     const args = [
         '--print',
         '--input-format', 'stream-json',
@@ -210,7 +284,7 @@ function spawnClaude(params) {
         '--mcp-config', mcpConfigPath,
         '--strict-mcp-config',
         '--no-session-persistence',
-        '--session-id', presenceSessionId,
+        '--session-id', spawnSessionId,
     ];
     const pinnedModel = (0, config_1.getBridgeModel)();
     if (pinnedModel) {
@@ -266,6 +340,15 @@ function spawnClaude(params) {
     let extractedModel = null;
     let buffer = '';
     let killedForViolation = false;
+    // Retry/error-surfacing tracking for this attempt:
+    //  - sawApiError: the CLI emitted an "API Error:" assistant text event (the
+    //    way Claude Code reports an underlying API failure mid-turn).
+    //  - apiErrorText: that text, captured for describeCliFailure().
+    //  - producedRealOutput: any real assistant text or tool_use was emitted, so
+    //    a later failure must NOT be retried (could double-run a side-effect).
+    let sawApiError = false;
+    let apiErrorText = '';
+    let producedRealOutput = false;
     proc.stdout.on('data', (chunk) => {
         buffer += chunk.toString('utf-8');
         const lines = buffer.split('\n');
@@ -282,6 +365,10 @@ function spawnClaude(params) {
                 continue;
             }
             const type = event['type'];
+            // Set when this event is the CLI's "API Error:" turn — we neither forward
+            // it to the PWA nor let it reach the accumulator (it carries no real
+            // content and would poison history / show a raw error mid-stream).
+            let suppressEvent = false;
             // Extract model + key source info from the first system/init event.
             // No session-id persistence — Firestore is the only source of truth
             // now, and we pin --session-id to presenceSessionId on each turn's first spawn (a retry spawn claims a fresh uuid).
@@ -322,6 +409,7 @@ function spawnClaude(params) {
                     let wroteText = false;
                     for (const block of content) {
                         if (block['type'] === 'tool_use') {
+                            producedRealOutput = true;
                             const toolName = block['name'];
                             const toolId = block['id'];
                             if (toolId)
@@ -370,13 +458,27 @@ function spawnClaude(params) {
                         else if (block['type'] === 'text') {
                             const text = block['text'];
                             if (text) {
-                                if (debug) {
-                                    // Full text, newlines intact — the readable transcript.
-                                    debugBlock('assistant', exports.SECTION_COLORS.assistant, text);
+                                if (/^API Error:/i.test(text.trimStart())) {
+                                    // The CLI is reporting an underlying API failure as assistant
+                                    // text. Capture it for the user-facing message, and suppress
+                                    // the whole event so the raw error never reaches the PWA or
+                                    // the accumulator (the gateway also blanks it via
+                                    // cleanTurnText — this is the upstream defense).
+                                    sawApiError = true;
+                                    apiErrorText = text.trim();
+                                    suppressEvent = true;
+                                    process.stderr.write(paint(exports.SECTION_COLORS.result, `[bridge] ${text.replace(/\n+/g, ' ')}`) + '\n');
                                 }
                                 else {
-                                    process.stderr.write(paint(exports.SECTION_COLORS.assistant, text.replace(/\n+/g, ' ')));
-                                    wroteText = true;
+                                    producedRealOutput = true;
+                                    if (debug) {
+                                        // Full text, newlines intact — the readable transcript.
+                                        debugBlock('assistant', exports.SECTION_COLORS.assistant, text);
+                                    }
+                                    else {
+                                        process.stderr.write(paint(exports.SECTION_COLORS.assistant, text.replace(/\n+/g, ' ')));
+                                        wroteText = true;
+                                    }
                                 }
                             }
                         }
@@ -412,7 +514,8 @@ function spawnClaude(params) {
                 if (typeof c === 'number')
                     costUsd = c;
             }
-            onEvent(event);
+            if (!suppressEvent)
+                onEvent(event);
         }
     });
     proc.stderr.on('data', (chunk) => {
@@ -434,9 +537,29 @@ function spawnClaude(params) {
             catch { /* ignore */ }
         }
         if (code !== 0 && code !== null) {
+            // Auto-retry when the CLI failed BEFORE producing any real output — the
+            // signature of the known print-mode 400 regression. A fresh spawn (new
+            // --session-id) often succeeds. We never retry once real text or a tool
+            // call landed, to avoid double-running a side-effectful tool. Retries use
+            // escalating backoff and stop past the wall-clock cap (see consts above).
+            const elapsed = Date.now() - firstAttemptAt;
+            if (attemptIdx < MAX_TURN_RETRIES && sawApiError && !producedRealOutput && elapsed < RETRY_WALL_CLOCK_CAP_MS) {
+                const delay = RETRY_BACKOFF_BASE_MS * (attemptIdx + 1);
+                const nextAttempt = attemptIdx + 2;
+                process.stderr.write(`[bridge] turn failed before output (${apiErrorText.replace(/\n+/g, ' ').slice(0, 120)}) — retrying (${nextAttempt} of ${MAX_TURN_RETRIES + 1}) in ${delay}ms\n`);
+                // Admin-only ephemeral thread notice — jargon is fine in Local Mode.
+                onNotice?.(`Claude Code print-mode 400 (tool-use concurrency, anthropics/claude-code#18131) — respawning, attempt ${nextAttempt}/${MAX_TURN_RETRIES + 1}…`);
+                const timer = setTimeout(() => {
+                    pendingRetries.delete(conversationId);
+                    spawnClaude({ ...params, _attemptIdx: attemptIdx + 1, _firstAttemptAt: firstAttemptAt });
+                }, delay);
+                pendingRetries.set(conversationId, timer);
+                return;
+            }
             // Pass any partial token usage we observed before the failure so the
-            // PWA and the gateway's bridge usage store can still record it.
-            onError(`claude exited with code ${code}`, usage, extractedModel);
+            // PWA and the gateway's bridge usage store can still record it. Surface a
+            // classified, user-readable message instead of the opaque exit code.
+            onError(describeCliFailure(code, apiErrorText), usage, extractedModel);
         }
         else {
             onDone(messageCount, costUsd, usage, extractedModel);

package/dist/index.js CHANGED Viewed

@@ -244,6 +244,14 @@ async function handleMessage(conversationId, text, sessionId, history, auth, vau
                 currentWs.send(JSON.stringify({ type: 'stream', conversationId, event }));
             }
         },
+        onNotice: (message) => {
+            // Ephemeral, non-persisted thread notice (admin-only Local Mode). Relayed
+            // by the gateway to the PWA SSE stream as a `notice` AgentEvent; it does
+            // NOT go through the turn accumulator, so it never lands in history.
+            if (currentWs?.readyState === ws_1.default.OPEN) {
+                currentWs.send(JSON.stringify({ type: 'notice', conversationId, message }));
+            }
+        },
         onDone: (messageCount, costUsd, usage, model) => {
             const parts = [];
             if (usage)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@1presence/bridge",
-  "version": "0.33.0",
+  "version": "0.35.0",
   "description": "Run 1Presence on your Mac and use your Claude.ai Pro subscription from any device",
   "bin": {
     "1presence-bridge": "dist/index.js"