npm - compact-agent - Versions diffs - 1.8.3 → 1.9.0 - Mend

compact-agent 1.8.3 → 1.9.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/dist/query.js CHANGED Viewed

@@ -1,29 +1,43 @@
 import { ALL_TOOLS, getToolByName } from './tools/index.js';
-import { streamChat } from './api.js';
+import { streamChat, resetClient } from './api.js';
 import { checkPermission } from './permissions.js';
 import { buildSystemPrompt } from './system-prompt.js';
 import { runHooks } from './hooks.js';
 import { scanToolCall, printSecurityWarning } from './security.js';
 import { trackUsage } from './cost-tracker.js';
 import { shouldCompact, compactMessages, quickCompact, DEFAULT_COMPACTION } from './compaction.js';
-import { theme, sym, printToolRun, printToolResult, printThinkingOpen, printThinkingText, printThinkingClose, printCost, printApiError, formatDuration } from './theme.js';
+import { theme, sym, printToolRun, printToolResult, printThinkingOpen, printThinkingText, printThinkingClose, printCost, printApiError, formatDuration, categorizeApiError } from './theme.js';
 import { isVoiceEnabled, getTtsConfig, getAccessibilityConfig, speakAssistantResponse, speak, speakUserEcho, } from './voice.js';
 import { isLikelyDestructive, describeDestructive, countWords, summarize } from './accessibility.js';
 import { audioCue } from './audio.js';
 import { setStatus } from './status.js';
-function suppressInputDuringStream() {
+import { collapseCompletedTurns } from './turn-context.js';
+function startInputSuppression() {
     const stdin = process.stdin;
     if (!stdin.isTTY) {
-        return { restore: () => { } };
+        return { pause: () => { }, resume: () => { }, restore: () => { } };
     }
     const wasRaw = stdin.isRaw;
-    // Snapshot the keypress listeners that aren't ours. Those are what we
-    // need to detach to stop readline from echoing + buffering. Slice to
-    // protect against the array mutating mid-iteration.
+    // Snapshot non-tagged keypress listeners. These are the ones we toggle
+    // on suppress/unsuppress; the tagged hotkey listener (F1–F10) stays
+    // attached unconditionally so status keys work during streaming and
+    // tool execution alike.
     const allKeypressListeners = stdin.listeners('keypress').slice();
-    const detachedListeners = allKeypressListeners.filter((l) => !l.__crowcoderHotkey__);
-    for (const l of detachedListeners) {
-        stdin.removeListener('keypress', l);
+    const togglableListeners = allKeypressListeners.filter((l) => !l.__crowcoderHotkey__);
+    let detached = false;
+    function suppress() {
+        if (detached)
+            return;
+        for (const l of togglableListeners)
+            stdin.removeListener('keypress', l);
+        detached = true;
+    }
+    function unsuppress() {
+        if (!detached)
+            return;
+        for (const l of togglableListeners)
+            stdin.on('keypress', l);
+        detached = false;
     }
     // Swallow data — Ctrl+C still exits, everything else is discarded so
     // it can't bubble up to anything we missed.
@@ -42,13 +56,14 @@ function suppressInputDuringStream() {
     catch { /* noop */ }
     stdin.on('data', dataHandler);
     stdin.resume();
+    // Start suppressed — typing during model streaming is the default-block case
+    suppress();
     return {
+        pause: unsuppress, // pause suppression = allow typing (for permission prompts)
+        resume: suppress, // resume suppression = block typing again
         restore: () => {
+            unsuppress(); // ensure listeners are back before we leave
             stdin.removeListener('data', dataHandler);
-            // Re-attach readline's keypress listeners in the original order.
-            for (const l of detachedListeners) {
-                stdin.on('keypress', l);
-            }
             try {
                 stdin.setRawMode(wasRaw);
             }
@@ -113,215 +128,268 @@ export async function runQuery(ctx) {
     // assistant turn, but the final TTS pass only fires after the no-tool-call
     // exit so tool descriptions aren't read out.
     let accumulatedAssistantText = '';
-    // Auto-compact if context is getting large
-    if (shouldCompact(ctx.messages, DEFAULT_COMPACTION)) {
-        console.log(theme.dim(`  ${sym.running} auto-compacting conversation context...`));
-        setStatus({ state: 'compacting' });
-        ctx.messages = await compactMessages(ctx.messages, ctx.config);
-    }
-    else {
-        // Quick compact: truncate oversized tool results
-        ctx.messages = quickCompact(ctx.messages);
-    }
-    // Tell the status singleton who we are. This is what F2 ("where am I?")
-    // speaks back to the user. Updated once per chain — model/provider/mode
-    // can't change mid-chain.
-    setStatus({
-        model: ctx.config.model,
-        provider: ctx.config.provider,
-        mode: ctx.mode,
-        permissionMode: ctx.config.permissionMode,
-    });
-    while (turns < maxTurns) {
-        turns++;
-        // Get the last user message for context-aware system prompt
-        const lastUserMsg = ctx.messages.filter((m) => m.role === 'user').pop();
-        const userQuery = typeof lastUserMsg?.content === 'string' ? lastUserMsg.content : undefined;
-        // Build full messages array with system prompt
-        const systemPrompt = buildSystemPrompt(ctx.config, ctx.cwd, ctx.mode, userQuery);
-        const apiMessages = [
-            { role: 'system', content: systemPrompt },
-            ...ctx.messages,
-        ];
-        let fullText = '';
-        let toolCalls;
-        let hasOutput = false;
-        let thinkingActive = false;
-        let leadingTrimmed = false; // strip leading whitespace from the model's first text chunk
-        let lastCharWasNewline = false; // collapse 3+ consecutive newlines down to 2
-        let consecutiveNewlines = 0;
-        const turnStart = Date.now();
-        function writeStreamText(chunk) {
-            // Trim leading whitespace until the first non-whitespace character so
-            // the model can't produce big vertical gaps before its real reply.
-            let text = chunk;
-            if (!leadingTrimmed) {
-                text = text.replace(/^[\s\n]+/, '');
-                if (text.length === 0)
-                    return; // entire chunk was leading whitespace
-                leadingTrimmed = true;
-            }
-            // Collapse runs of 3+ newlines into 2 so the body of the response is
-            // dense but still has paragraph breaks where the model intended them.
-            let out = '';
-            for (const ch of text) {
-                if (ch === '\n') {
-                    consecutiveNewlines++;
-                    if (consecutiveNewlines <= 2)
-                        out += ch;
+    // Auto-fallback: when the primary model returns a cryptic / unknown
+    // provider error (common for free experimental models like
+    // openrouter/owl-alpha which returns literally "ERROR" or "Provider
+    // returned error"), we transparently retry the SAME turn once with the
+    // user's configured fallbackModel. After we use it, this latches so we
+    // don't bounce back and forth between failing models in a single chain.
+    let usedFallbackModel = false;
+    // Input suppression spans the entire chain: model streaming AND tool
+    // execution. executeToolCalls calls inputGuard.pause()/resume() around
+    // permission prompts so rl.question() can still read user input. Final
+    // teardown happens in the finally block at the bottom of runQuery so
+    // the guard is always cleaned up even if something throws unexpectedly.
+    const inputGuard = startInputSuppression();
+    try {
+        // Turn-boundary collapse runs BEFORE compaction. Every completed prior
+        // turn becomes [user, "<final text>\n[Completed: used X, Y]"] — the
+        // model no longer sees stale tool_calls that it might mistake for
+        // pending work (the "I'll handle BOTH requests" / "all THREE requests"
+        // bug). The current turn (latest user message forward) is left intact
+        // because its tool_calls and tool messages are still in flight.
+        ctx.messages = collapseCompletedTurns(ctx.messages);
+        // Auto-compact if context is getting large
+        if (shouldCompact(ctx.messages, DEFAULT_COMPACTION)) {
+            console.log(theme.dim(`  ${sym.running} auto-compacting conversation context...`));
+            setStatus({ state: 'compacting' });
+            ctx.messages = await compactMessages(ctx.messages, ctx.config);
+        }
+        else {
+            // Quick compact: truncate oversized tool results
+            ctx.messages = quickCompact(ctx.messages);
+        }
+        // Tell the status singleton who we are. This is what F2 ("where am I?")
+        // speaks back to the user. Updated once per chain — model/provider/mode
+        // can't change mid-chain.
+        setStatus({
+            model: ctx.config.model,
+            provider: ctx.config.provider,
+            mode: ctx.mode,
+            permissionMode: ctx.config.permissionMode,
+        });
+        while (turns < maxTurns) {
+            turns++;
+            // Get the last user message for context-aware system prompt
+            const lastUserMsg = ctx.messages.filter((m) => m.role === 'user').pop();
+            const userQuery = typeof lastUserMsg?.content === 'string' ? lastUserMsg.content : undefined;
+            // Build full messages array with system prompt
+            const systemPrompt = buildSystemPrompt(ctx.config, ctx.cwd, ctx.mode, userQuery);
+            const apiMessages = [
+                { role: 'system', content: systemPrompt },
+                ...ctx.messages,
+            ];
+            let fullText = '';
+            let toolCalls;
+            let hasOutput = false;
+            let thinkingActive = false;
+            let leadingTrimmed = false; // strip leading whitespace from the model's first text chunk
+            let lastCharWasNewline = false; // collapse 3+ consecutive newlines down to 2
+            let consecutiveNewlines = 0;
+            const turnStart = Date.now();
+            function writeStreamText(chunk) {
+                // Trim leading whitespace until the first non-whitespace character so
+                // the model can't produce big vertical gaps before its real reply.
+                let text = chunk;
+                if (!leadingTrimmed) {
+                    text = text.replace(/^[\s\n]+/, '');
+                    if (text.length === 0)
+                        return; // entire chunk was leading whitespace
+                    leadingTrimmed = true;
                 }
-                else {
-                    consecutiveNewlines = 0;
-                    out += ch;
+                // Collapse runs of 3+ newlines into 2 so the body of the response is
+                // dense but still has paragraph breaks where the model intended them.
+                let out = '';
+                for (const ch of text) {
+                    if (ch === '\n') {
+                        consecutiveNewlines++;
+                        if (consecutiveNewlines <= 2)
+                            out += ch;
+                    }
+                    else {
+                        consecutiveNewlines = 0;
+                        out += ch;
+                    }
                 }
+                if (out.length === 0)
+                    return;
+                lastCharWasNewline = out.endsWith('\n');
+                process.stdout.write(theme.primary(out));
+                fullText += out;
             }
-            if (out.length === 0)
-                return;
-            lastCharWasNewline = out.endsWith('\n');
-            process.stdout.write(theme.primary(out));
-            fullText += out;
-        }
-        // Suppress terminal echo while we stream so mid-stream keystrokes
-        // don't interleave with the model's output. Restored in `finally`.
-        const inputGuard = suppressInputDuringStream();
-        // We're about to wait on the API; tell the status singleton so a blind
-        // user pressing F1 hears "calling claude-sonnet-4, 6 seconds elapsed"
-        // instead of a stale "idle".
-        setStatus({ state: 'streaming' });
-        try {
-            for await (const event of streamChat(ctx.config, apiMessages, ALL_TOOLS)) {
-                if (event.type === 'thinking' && event.content) {
-                    // showThinking defaults to true; only off when explicitly disabled.
-                    if (ctx.config.showThinking !== false) {
-                        if (!thinkingActive) {
-                            printThinkingOpen();
-                            thinkingActive = true;
+            // (inputGuard is now lifted to runQuery scope — see above. It spans
+            // both streaming and tool execution, with pause/resume around the
+            // permission prompts inside executeToolCalls.)
+            // We're about to wait on the API; tell the status singleton so a blind
+            // user pressing F1 hears "calling claude-sonnet-4, 6 seconds elapsed"
+            // instead of a stale "idle".
+            setStatus({ state: 'streaming' });
+            try {
+                for await (const event of streamChat(ctx.config, apiMessages, ALL_TOOLS)) {
+                    if (event.type === 'thinking' && event.content) {
+                        // showThinking defaults to true; only off when explicitly disabled.
+                        if (ctx.config.showThinking !== false) {
+                            if (!thinkingActive) {
+                                printThinkingOpen();
+                                thinkingActive = true;
+                            }
+                            printThinkingText(event.content);
                         }
-                        printThinkingText(event.content);
                     }
-                }
-                else if (event.type === 'text' && event.content) {
-                    if (thinkingActive) {
-                        printThinkingClose();
-                        thinkingActive = false;
+                    else if (event.type === 'text' && event.content) {
+                        if (thinkingActive) {
+                            printThinkingClose();
+                            thinkingActive = false;
+                        }
+                        if (!hasOutput) {
+                            hasOutput = true;
+                            // First token arrived; promote status so F1 reports "receiving"
+                            // rather than the still-waiting "streaming" message.
+                            setStatus({ state: 'responding' });
+                        }
+                        writeStreamText(event.content);
                     }
-                    if (!hasOutput) {
-                        hasOutput = true;
-                        // First token arrived; promote status so F1 reports "receiving"
-                        // rather than the still-waiting "streaming" message.
-                        setStatus({ state: 'responding' });
+                    else if (event.type === 'tool_call') {
+                        toolCalls = event.toolCalls;
+                    }
+                    else if (event.type === 'done') {
+                        if (event.usage) {
+                            const u = event.usage;
+                            const { cost, warning } = trackUsage(ctx.sessionId, ctx.config.model, u.prompt, u.completion);
+                            // Single newline separator if we just streamed text, then the
+                            // compact telemetry line.
+                            if (hasOutput && !lastCharWasNewline)
+                                process.stdout.write('\n');
+                            printCost(u.prompt, u.completion, cost, warning, Date.now() - turnStart);
+                        }
                     }
-                    writeStreamText(event.content);
                 }
-                else if (event.type === 'tool_call') {
-                    toolCalls = event.toolCalls;
+            }
+            catch (err) {
+                const msg = err instanceof Error ? err.message : String(err);
+                // Always close the streaming line first so the error doesn't glue to text.
+                if (hasOutput && !lastCharWasNewline)
+                    process.stdout.write('\n');
+                // ── Auto-fallback path ─────────────────────────────────
+                // Categorize the error. If it's "unknown" (the provider returned a
+                // cryptic empty error like "ERROR" or "Provider returned error" that
+                // matches no specific pattern) AND we have a fallbackModel configured
+                // AND we haven't already used it, swap models and silently retry the
+                // same turn. This rescues users from broken free models without them
+                // having to manually /clear and /model switch.
+                const cat = categorizeApiError(msg, {
+                    baseURL: ctx.config.baseURL,
+                    provider: ctx.config.provider,
+                    model: ctx.config.model,
+                });
+                const canFallback = cat.category === 'unknown'
+                    && ctx.config.fallbackModel
+                    && ctx.config.fallbackModel !== ctx.config.model
+                    && !usedFallbackModel;
+                if (canFallback) {
+                    usedFallbackModel = true;
+                    const failedModel = ctx.config.model;
+                    const fallback = ctx.config.fallbackModel;
+                    ctx.config.model = fallback;
+                    resetClient();
+                    console.log(theme.warning(`  ${sym.warn} ${failedModel} returned a cryptic provider error — retrying once with fallback model ${fallback}.`));
+                    console.log(theme.dim('    (configure a different fallback with: /fallback <model-id>)'));
+                    turns--; // this retry doesn't burn a turn slot from the max-turns budget
+                    continue;
                 }
-                else if (event.type === 'done') {
-                    if (event.usage) {
-                        const u = event.usage;
-                        const { cost, warning } = trackUsage(ctx.sessionId, ctx.config.model, u.prompt, u.completion);
-                        // Single newline separator if we just streamed text, then the
-                        // compact telemetry line.
-                        if (hasOutput && !lastCharWasNewline)
-                            process.stdout.write('\n');
-                        printCost(u.prompt, u.completion, cost, warning, Date.now() - turnStart);
+                printApiError(msg, {
+                    baseURL: ctx.config.baseURL,
+                    provider: ctx.config.provider,
+                    model: ctx.config.model,
+                });
+                // Voice: announce errors aloud for screen-reader users
+                if (isVoiceEnabled(ctx.config) && getAccessibilityConfig(ctx.config).announceErrors) {
+                    const tts = getTtsConfig(ctx.config);
+                    if (tts.apiKey) {
+                        // Keep it terse — one short sentence — to avoid burning quota on
+                        // long stack traces. The error pretty-printer already showed the
+                        // categorized version to the screen-reader.
+                        speak(`API error: ${msg.slice(0, 120)}`, ctx.config, { voiceId: tts.assistantVoiceId }).catch(() => { });
+                    }
+                    if (getAccessibilityConfig(ctx.config).audioCues) {
+                        audioCue('error').catch(() => { });
                     }
                 }
+                ctx.messages.push({ role: 'assistant', content: `[API error: ${msg}]` });
+                break;
             }
-        }
-        catch (err) {
-            const msg = err instanceof Error ? err.message : String(err);
-            // Always close the streaming line first so the error doesn't glue to text.
-            if (hasOutput && !lastCharWasNewline)
+            if (hasOutput && !lastCharWasNewline) {
                 process.stdout.write('\n');
-            printApiError(msg, {
-                baseURL: ctx.config.baseURL,
-                provider: ctx.config.provider,
-                model: ctx.config.model,
-            });
-            // Voice: announce errors aloud for screen-reader users
-            if (isVoiceEnabled(ctx.config) && getAccessibilityConfig(ctx.config).announceErrors) {
-                const tts = getTtsConfig(ctx.config);
-                if (tts.apiKey) {
-                    // Keep it terse — one short sentence — to avoid burning quota on
-                    // long stack traces. The error pretty-printer already showed the
-                    // categorized version to the screen-reader.
-                    speak(`API error: ${msg.slice(0, 120)}`, ctx.config, { voiceId: tts.assistantVoiceId }).catch(() => { });
-                }
-                if (getAccessibilityConfig(ctx.config).audioCues) {
-                    audioCue('error').catch(() => { });
-                }
             }
-            ctx.messages.push({ role: 'assistant', content: `[API error: ${msg}]` });
-            inputGuard.restore();
-            break;
+            // Save assistant message
+            const assistantMsg = { role: 'assistant', content: fullText || null };
+            if (toolCalls && toolCalls.length > 0) {
+                assistantMsg.tool_calls = toolCalls;
+            }
+            ctx.messages.push(assistantMsg);
+            // Accumulate visible assistant text for chain-end TTS. We don't TTS
+            // mid-chain because the model often emits short bridging sentences
+            // between tool calls — speaking each one is noisy and slow.
+            if (fullText)
+                accumulatedAssistantText += (accumulatedAssistantText ? '\n\n' : '') + fullText;
+            // If no tool calls, we're done
+            if (!toolCalls || toolCalls.length === 0)
+                break;
+            // Execute tool calls — executeToolCalls itself flips per-tool state
+            // and uses inputGuard.pause()/resume() around each permission prompt
+            // so rl.question() can read user input even though suppression is on
+            // for the rest of the chain.
+            const toolResults = await executeToolCalls(toolCalls, ctx, inputGuard);
+            ctx.messages.push(...toolResults);
         }
-        inputGuard.restore();
-        if (hasOutput && !lastCharWasNewline) {
-            process.stdout.write('\n');
+        // Chain ended; back to idle so F1 reports the correct state.
+        setStatus({ state: 'idle' });
+        // ── Voice: read the assistant's final response ────────────
+        // Off the hot path — fire-and-forget so the next prompt appears
+        // immediately. The playback runs in background; F2 pauses, F4 skips.
+        if (isVoiceEnabled(ctx.config) && accumulatedAssistantText.trim()) {
+            const tts = getTtsConfig(ctx.config);
+            if (tts.apiKey) {
+                const a = getAccessibilityConfig(ctx.config);
+                let toRead = accumulatedAssistantText;
+                // If the response is long, abbreviate via cheap heuristic summary so
+                // blind users aren't forced to listen to 800 words. They can press
+                // F3 (replay) on chunks or ask "give me the full version" verbally.
+                const words = countWords(toRead);
+                if (words >= a.longResponseThreshold) {
+                    toRead = summarize(toRead, a.longResponseThreshold);
+                }
+                // Register an abort controller + last-chunk + last-full-response
+                // globally so the hotkey handler in index.ts can cancel / replay.
+                // - __voiceLastChunk drives PGUP "replay last chunk"
+                // - __voiceLastFullResponse drives F3 "read full" + F4 "read summary"
+                const g = globalThis;
+                const ctl = new AbortController();
+                g.__voicePlaybackCtl = ctl;
+                g.__voiceLastChunk = toRead;
+                g.__voiceLastFullResponse = accumulatedAssistantText;
+                speakAssistantResponse(toRead, ctx.config, ctl.signal).catch(() => { });
+            }
         }
-        // Save assistant message
-        const assistantMsg = { role: 'assistant', content: fullText || null };
-        if (toolCalls && toolCalls.length > 0) {
-            assistantMsg.tool_calls = toolCalls;
+        if (turns >= maxTurns) {
+            console.log(theme.warning(`\n  ${sym.warn} reached max turns limit`));
         }
-        ctx.messages.push(assistantMsg);
-        // Accumulate visible assistant text for chain-end TTS. We don't TTS
-        // mid-chain because the model often emits short bridging sentences
-        // between tool calls — speaking each one is noisy and slow.
-        if (fullText)
-            accumulatedAssistantText += (accumulatedAssistantText ? '\n\n' : '') + fullText;
-        // If no tool calls, we're done
-        if (!toolCalls || toolCalls.length === 0)
-            break;
-        // Execute tool calls — executeToolCalls itself flips per-tool state
-        const toolResults = await executeToolCalls(toolCalls, ctx);
-        ctx.messages.push(...toolResults);
-    }
-    // Chain ended; back to idle so F1 reports the correct state.
-    setStatus({ state: 'idle' });
-    // ── Voice: read the assistant's final response ────────────
-    // Off the hot path — fire-and-forget so the next prompt appears
-    // immediately. The playback runs in background; F2 pauses, F4 skips.
-    if (isVoiceEnabled(ctx.config) && accumulatedAssistantText.trim()) {
-        const tts = getTtsConfig(ctx.config);
-        if (tts.apiKey) {
-            const a = getAccessibilityConfig(ctx.config);
-            let toRead = accumulatedAssistantText;
-            // If the response is long, abbreviate via cheap heuristic summary so
-            // blind users aren't forced to listen to 800 words. They can press
-            // F3 (replay) on chunks or ask "give me the full version" verbally.
-            const words = countWords(toRead);
-            if (words >= a.longResponseThreshold) {
-                toRead = summarize(toRead, a.longResponseThreshold);
-            }
-            // Register an abort controller + last-chunk + last-full-response
-            // globally so the hotkey handler in index.ts can cancel / replay.
-            // - __voiceLastChunk drives PGUP "replay last chunk"
-            // - __voiceLastFullResponse drives F3 "read full" + F4 "read summary"
-            const g = globalThis;
-            const ctl = new AbortController();
-            g.__voicePlaybackCtl = ctl;
-            g.__voiceLastChunk = toRead;
-            g.__voiceLastFullResponse = accumulatedAssistantText;
-            speakAssistantResponse(toRead, ctx.config, ctl.signal).catch(() => { });
+        // Chain-elapsed summary. One line per response chain (user msg → assistant
+        // ending without a tool call), printed regardless of how many tool-call
+        // iterations the chain took. Lets the user see how long that whole
+        // exchange took, separate from per-turn cost timings.
+        const chainMs = Date.now() - chainStart;
+        // Only show if there was meaningful work — multi-second chains. Sub-second
+        // chains (slash command rejects, instant returns) don't need a chain line.
+        if (chainMs > 1500) {
+            console.log(theme.dim(`  chain ${formatDuration(chainMs)} · ${turns} ${turns === 1 ? 'turn' : 'turns'}`));
         }
     }
-    if (turns >= maxTurns) {
-        console.log(theme.warning(`\n  ${sym.warn} reached max turns limit`));
-    }
-    // Chain-elapsed summary. One line per response chain (user msg → assistant
-    // ending without a tool call), printed regardless of how many tool-call
-    // iterations the chain took. Lets the user see how long that whole
-    // exchange took, separate from per-turn cost timings.
-    const chainMs = Date.now() - chainStart;
-    // Only show if there was meaningful work — multi-second chains. Sub-second
-    // chains (slash command rejects, instant returns) don't need a chain line.
-    if (chainMs > 1500) {
-        console.log(theme.dim(`  chain ${formatDuration(chainMs)} · ${turns} ${turns === 1 ? 'turn' : 'turns'}`));
+    finally {
+        inputGuard.restore();
     }
 }
-async function executeToolCalls(toolCalls, ctx) {
+async function executeToolCalls(toolCalls, ctx, inputGuard) {
     const results = [];
     for (const tc of toolCalls) {
         const toolName = tc.function.name;
@@ -409,7 +477,18 @@ async function executeToolCalls(toolCalls, ctx) {
             }
         }
         // ── Permission check ──────────────────────────────────
-        const allowed = await checkPermission(tool, input, ctx.config, ctx.rl);
+        // Pause input suppression so rl.question() can read the user's
+        // Y/n/always response — without this, readline's keypress listener is
+        // detached and the prompt would hang forever. Re-suppress immediately
+        // after so any typing during the next tool's execution is blocked.
+        inputGuard.pause();
+        let allowed;
+        try {
+            allowed = await checkPermission(tool, input, ctx.config, ctx.rl);
+        }
+        finally {
+            inputGuard.resume();
+        }
         if (!allowed) {
             console.log(theme.warning(`  ${sym.warn} Denied: ${toolName}`));
             results.push({