npm - @codexstar/pi-listen - Versions diffs - 1.0.17 → 1.0.18 - Mend

@codexstar/pi-listen 1.0.17 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/extensions/voice.ts +415 -336
package/package.json +1 -1

package/extensions/voice.ts CHANGED Viewed

@@ -1,29 +1,48 @@
 /**
- * pi-voice — Deepgram WebSocket streaming STT for Pi CLI.
+ * pi-voice — Enterprise-grade voice STT for Pi CLI.
  *
  * Architecture (modeled after Claude Code's voice pipeline):
- *   1. SoX `rec` captures mic audio as raw PCM (16kHz, mono, 16-bit)
- *      and pipes it to stdout (no file).
- *   2. Raw PCM chunks are streamed over a WebSocket to Deepgram Nova 3.
- *   3. Deepgram returns interim + final transcripts in real-time.
- *   4. Interim transcripts update a live widget above the editor.
- *   5. On key-release (or toggle stop), a CloseStream message is sent;
- *      final transcript is injected into the editor.
+ *
+ *   STATE MACHINE
+ *   ─────────────
+ *   idle → warmup → recording → finalizing → idle
+ *              ↑         │
+ *              └─────────┘  (rapid re-press recovery)
+ *
+ *   warmup:     User holds SPACE for ≥ HOLD_THRESHOLD_MS (500ms).
+ *               A "keep holding…" hint is shown. If released before
+ *               the threshold, a normal space character is typed.
+ *
+ *   recording:  SoX captures PCM → Deepgram WebSocket streaming.
+ *               Live interim + final transcripts update the widget.
+ *               Release SPACE (or press again in toggle mode) → stop.
+ *
+ *   finalizing: CloseStream sent to Deepgram. Waiting for final
+ *               transcript. Safety timeout auto-completes.
+ *
+ *   HOLD-TO-TALK DETECTION (non-Kitty terminals)
+ *   ─────────────────────────────────────────────
+ *   Holding a key sends rapid key-press events (~30ms apart).
+ *   "Release" is detected when the gap between presses exceeds
+ *   RELEASE_DETECT_MS (150ms).
+ *
+ *   ENTERPRISE FALLBACKS
+ *   ────────────────────
+ *   • Session corruption guard: new recording request during
+ *     finalizing automatically cancels the stale session first.
+ *   • Transient failure retry: on any streaming error (WebSocket
+ *     or Deepgram), auto-retry once after 300ms.
+ *   • Stale transcript cleanup: any prior transcript is cleared
+ *     before new recording begins.
+ *   • Silence vs. no-speech: distinguishes "mic captured silence"
+ *     from "no speech detected" with distinct user messages.
  *
  * Activation:
- *   - Hold SPACE (empty editor) → release to finalize
- *   - Ctrl+Shift+V → toggle start/stop (fallback for non-Kitty terminals)
+ *   - Hold SPACE (≥500ms) → release to finalize
+ *   - Ctrl+Shift+V → toggle start/stop (always works)
  *   - Ctrl+Shift+B → hold to record → auto-send as /btw
  *
- * Config in ~/.pi/agent/settings.json:
- * {
- *   "voice": {
- *     "enabled": true,
- *     "language": "en",
- *     "backend": "deepgram",
- *     "model": "nova-3"
- *   }
- * }
+ * Config in ~/.pi/agent/settings.json under "voice": { ... }
  */
 import type {
@@ -57,7 +76,14 @@ import { buildProvisioningPlan } from "./voice/install";
 // ─── Types ───────────────────────────────────────────────────────────────────
-type VoiceState = "idle" | "recording" | "transcribing";
+/**
+ * Voice state machine — strict transitions only:
+ *   idle → warmup → recording → finalizing → idle
+ *   warmup → idle  (released before threshold)
+ *   recording → idle  (on error)
+ *   finalizing → idle  (on completion or timeout)
+ */
+type VoiceState = "idle" | "warmup" | "recording" | "finalizing";
 interface BtwExchange {
 	question: string;
@@ -76,7 +102,14 @@ const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
 const KEEPALIVE_INTERVAL_MS = 8000;
 const FINALIZE_SAFETY_TIMEOUT_MS = 5000;
 const FINALIZE_NO_DATA_TIMEOUT_MS = 1500;
-const MAX_RECORDING_SECS = 120; // 2 minutes safety cap (streaming is efficient)
+const MAX_RECORDING_SECS = 120;
+// Hold-to-talk timing
+const HOLD_THRESHOLD_MS = 500;    // Must hold for this long before activation
+const RELEASE_DETECT_MS = 150;    // Gap in key-repeat → "released"
+const RETRY_DELAY_MS = 300;       // Auto-retry on transient failure during rapid re-press
+const MAX_RETRY_ATTEMPTS = 1;     // Max retries per activation attempt
+const CORRUPTION_GUARD_MS = 200;  // Min gap between stop and restart
 const EXT_DIR = path.dirname(new URL(import.meta.url).pathname);
 const PROJECT_ROOT = path.join(EXT_DIR, "..");
@@ -264,25 +297,17 @@ async function transcribeAudioFile(
 interface StreamingSession {
 	ws: WebSocket;
 	recProcess: ChildProcess;
-	interimText: string;      // Current interim (partial) transcript
-	finalizedParts: string[]; // All finalized transcript segments
+	interimText: string;
+	finalizedParts: string[];
 	keepAliveTimer: ReturnType<typeof setInterval> | null;
 	closed: boolean;
+	hadAudioData: boolean;       // Track if we received any audio data
+	hadSpeech: boolean;          // Track if Deepgram detected any speech
 	onTranscript: (interim: string, finals: string[]) => void;
-	onDone: (fullText: string) => void;
+	onDone: (fullText: string, meta: { hadAudio: boolean; hadSpeech: boolean }) => void;
 	onError: (err: string) => void;
 }
-function getDeepgramApiKey(): string | null {
-	// Priority: env var → config file → null
-	return process.env.DEEPGRAM_API_KEY || null;
-}
-/**
- * Resolve the Deepgram API key from all sources:
- * 1. process.env.DEEPGRAM_API_KEY (shell)
- * 2. config.deepgramApiKey (settings.json, persisted at setup time)
- */
 function resolveDeepgramApiKey(config: VoiceConfig): string | null {
 	return process.env.DEEPGRAM_API_KEY || config.deepgramApiKey || null;
 }
@@ -290,7 +315,6 @@ function resolveDeepgramApiKey(config: VoiceConfig): string | null {
 function isDeepgramStreaming(config: VoiceConfig): boolean {
 	const key = resolveDeepgramApiKey(config);
 	if (!key) return false;
-	// Use streaming for deepgram backend, or auto mode when deepgram key is available
 	return config.backend === "deepgram" || (config.backend === "auto" && !!key);
 }
@@ -299,8 +323,8 @@ function buildDeepgramWsUrl(config: VoiceConfig): string {
 		encoding: ENCODING,
 		sample_rate: String(SAMPLE_RATE),
 		channels: String(CHANNELS),
-		endpointing: "300",       // ms of silence before phrase boundary
-		utterance_end_ms: "1000", // ms of silence before utterance is complete
+		endpointing: "300",
+		utterance_end_ms: "1000",
 		language: config.language || "en",
 		model: config.model || "nova-3",
 		smart_format: "true",
@@ -313,7 +337,7 @@ function startStreamingSession(
 	config: VoiceConfig,
 	callbacks: {
 		onTranscript: (interim: string, finals: string[]) => void;
-		onDone: (fullText: string) => void;
+		onDone: (fullText: string, meta: { hadAudio: boolean; hadSpeech: boolean }) => void;
 		onError: (err: string) => void;
 	},
 ): StreamingSession | null {
@@ -328,7 +352,6 @@ function startStreamingSession(
 		return null;
 	}
-	// Start SoX streaming raw PCM to stdout (no file)
 	const recProc = spawn("rec", [
 		"-q",
 		"-r", String(SAMPLE_RATE),
@@ -336,12 +359,11 @@ function startStreamingSession(
 		"-b", "16",
 		"-e", "signed-integer",
 		"-t", "raw",
-		"-",  // output to stdout
+		"-",
 	], { stdio: ["pipe", "pipe", "pipe"] });
-	recProc.stderr?.on("data", () => {}); // suppress SoX warnings
+	recProc.stderr?.on("data", () => {});
-	// Connect WebSocket to Deepgram
 	const wsUrl = buildDeepgramWsUrl(config);
 	const ws = new WebSocket(wsUrl, {
 		headers: {
@@ -356,25 +378,25 @@ function startStreamingSession(
 		finalizedParts: [],
 		keepAliveTimer: null,
 		closed: false,
+		hadAudioData: false,
+		hadSpeech: false,
 		onTranscript: callbacks.onTranscript,
 		onDone: callbacks.onDone,
 		onError: callbacks.onError,
 	};
 	ws.onopen = () => {
-		// Send initial KeepAlive
 		try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
-		// Start keepalive timer
 		session.keepAliveTimer = setInterval(() => {
 			if (ws.readyState === WebSocket.OPEN) {
 				try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
 			}
 		}, KEEPALIVE_INTERVAL_MS);
-		// Pipe SoX stdout → WebSocket as binary frames
 		recProc.stdout?.on("data", (chunk: Buffer) => {
 			if (ws.readyState === WebSocket.OPEN) {
+				session.hadAudioData = true;
 				try { ws.send(chunk); } catch {}
 			}
 		});
@@ -389,38 +411,27 @@ function startStreamingSession(
 				const alt = msg.channel?.alternatives?.[0];
 				const transcript = alt?.transcript || "";
+				if (transcript.trim()) {
+					session.hadSpeech = true;
+				}
 				if (msg.is_final) {
-					// Final result for this audio segment
 					if (transcript.trim()) {
 						session.finalizedParts.push(transcript.trim());
 					}
 					session.interimText = "";
 				} else {
-					// Interim result — live update
 					session.interimText = transcript;
 				}
 				session.onTranscript(session.interimText, session.finalizedParts);
-				// If speech_final is true, it's the end of an utterance
-				// (similar to TranscriptEndpoint in Claude Code's protocol)
-				if (msg.speech_final && transcript.trim()) {
-					// Already added to finalizedParts above when is_final was true
-				}
-			} else if (msg.type === "Metadata") {
-				// Connection metadata — ignore
-			} else if (msg.type === "UtteranceEnd") {
-				// Utterance boundary — Deepgram detected end of speech
-				// Nothing extra needed, is_final already handles finalization
 			} else if (msg.type === "Error" || msg.type === "error") {
 				session.onError(msg.message || msg.description || "Deepgram error");
 			}
-		} catch (e: any) {
-			// Ignore parse errors for binary data
-		}
+		} catch {}
 	};
-	ws.onerror = (event: Event) => {
+	ws.onerror = () => {
 		if (!session.closed) {
 			session.onError("WebSocket connection error");
 		}
@@ -437,7 +448,6 @@ function startStreamingSession(
 	});
 	recProc.on("close", () => {
-		// SoX stopped — send CloseStream to Deepgram
 		if (ws.readyState === WebSocket.OPEN) {
 			try { ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
 		}
@@ -449,22 +459,20 @@ function startStreamingSession(
 function stopStreamingSession(session: StreamingSession): void {
 	if (session.closed) return;
-	// Stop the microphone
 	try { session.recProcess.kill("SIGTERM"); } catch {}
-	// CloseStream tells Deepgram to flush remaining audio
 	if (session.ws.readyState === WebSocket.OPEN) {
 		try { session.ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
 	}
-	// Safety: finalize after timeout even if Deepgram doesn't respond
+	// Safety timeout
 	setTimeout(() => {
 		if (!session.closed) {
 			finalizeSession(session);
 		}
 	}, FINALIZE_SAFETY_TIMEOUT_MS);
-	// Shorter timeout: if no new data arrives for 1.5s, assume done
+	// Quick finalize if no new data
 	let lastDataTime = Date.now();
 	const origOnMessage = session.ws.onmessage;
 	session.ws.onmessage = (event: MessageEvent) => {
@@ -486,21 +494,32 @@ function finalizeSession(session: StreamingSession): void {
 	if (session.closed) return;
 	session.closed = true;
-	// Clean up keepalive
 	if (session.keepAliveTimer) {
 		clearInterval(session.keepAliveTimer);
 		session.keepAliveTimer = null;
 	}
-	// Close WebSocket
 	try { session.ws.close(); } catch {}
-	// Kill SoX if still running
 	try { session.recProcess.kill("SIGKILL"); } catch {}
-	// Deliver final transcript
 	const fullText = session.finalizedParts.join(" ").trim();
-	session.onDone(fullText);
+	session.onDone(fullText, {
+		hadAudio: session.hadAudioData,
+		hadSpeech: session.hadSpeech,
+	});
+}
+// ─── Abort helper — nuke everything synchronously ────────────────────────────
+function abortSession(session: StreamingSession | null): void {
+	if (!session || session.closed) return;
+	session.closed = true;
+	if (session.keepAliveTimer) {
+		clearInterval(session.keepAliveTimer);
+		session.keepAliveTimer = null;
+	}
+	try { session.ws.close(); } catch {}
+	try { session.recProcess.kill("SIGKILL"); } catch {}
 }
 // ─── Extension ───────────────────────────────────────────────────────────────
@@ -515,11 +534,20 @@ export default function (pi: ExtensionAPI) {
 	let recordingStart = 0;
 	let statusTimer: ReturnType<typeof setInterval> | null = null;
 	let terminalInputUnsub: (() => void) | null = null;
-	let isHolding = false;
 	// Streaming session state
 	let activeSession: StreamingSession | null = null;
 	let currentTarget: "editor" | "btw" = "editor";
+	let retryAttempts = 0;
+	let lastStopTime = 0;    // For corruption guard
+	// Hold-to-talk state
+	let kittyReleaseDetected = false;
+	let spaceDownTime: number | null = null;
+	let holdActivationTimer: ReturnType<typeof setTimeout> | null = null;
+	let spaceConsumed = false;        // True once threshold passed and recording started
+	let releaseDetectTimer: ReturnType<typeof setTimeout> | null = null;
+	let warmupWidgetTimer: ReturnType<typeof setInterval> | null = null;
 	// ─── BTW State ───────────────────────────────────────────────────────────
@@ -548,13 +576,16 @@ export default function (pi: ExtensionAPI) {
 				ctx.ui.setStatus("voice", `MIC ${modeTag}`);
 				break;
 			}
+			case "warmup":
+				ctx.ui.setStatus("voice", "🎙️ HOLD...");
+				break;
 			case "recording": {
 				const secs = Math.round((Date.now() - recordingStart) / 1000);
 				ctx.ui.setStatus("voice", `🔴 REC ${secs}s`);
 				break;
 			}
-			case "transcribing":
-				ctx.ui.setStatus("voice", "STT...");
+			case "finalizing":
+				ctx.ui.setStatus("voice", "⏳ STT...");
 				break;
 		}
 	}
@@ -564,20 +595,57 @@ export default function (pi: ExtensionAPI) {
 		updateVoiceStatus();
 	}
+	// ─── Cleanup helpers ─────────────────────────────────────────────────────
+	function clearHoldTimer() {
+		if (holdActivationTimer) {
+			clearTimeout(holdActivationTimer);
+			holdActivationTimer = null;
+		}
+	}
+	function clearReleaseTimer() {
+		if (releaseDetectTimer) {
+			clearTimeout(releaseDetectTimer);
+			releaseDetectTimer = null;
+		}
+	}
+	function clearWarmupWidget() {
+		if (warmupWidgetTimer) {
+			clearInterval(warmupWidgetTimer);
+			warmupWidgetTimer = null;
+		}
+	}
+	function clearRecordingAnimTimer() {
+		const timer = (showRecordingWidget as any)?._animTimer;
+		if (timer) {
+			clearInterval(timer);
+			(showRecordingWidget as any)._animTimer = null;
+		}
+	}
+	function hideWidget() {
+		if (ctx?.hasUI) ctx.ui.setWidget("voice-recording", undefined);
+	}
 	function voiceCleanup() {
 		if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
 		clearHoldTimer();
 		clearReleaseTimer();
-		stopRecordingWidgetAnimation();
+		clearWarmupWidget();
+		clearRecordingAnimTimer();
 		if (activeSession) {
-			finalizeSession(activeSession);
+			abortSession(activeSession);
 			activeSession = null;
 		}
 		if (legacyRecProcess) { legacyRecProcess.kill("SIGTERM"); legacyRecProcess = null; }
 		if (tempFile) { try { fs.unlinkSync(tempFile); } catch {} tempFile = null; }
-		isHolding = false;
 		spaceConsumed = false;
 		spaceDownTime = null;
+		retryAttempts = 0;
+		hideWidget();
 		setVoiceState("idle");
 	}
@@ -609,58 +677,64 @@ export default function (pi: ExtensionAPI) {
 		].join("\n"), validated ? "info" : "warning");
 	}
-	// ─── Live Transcript Widget (Component-based, themed) ───────────────────
+	// ─── Warmup Widget ──────────────────────────────────────────────────────
+	//
+	// During the 500ms hold threshold, show a subtle "keep holding…" hint
+	// with a progress indicator. This matches Claude Code's warmup pattern.
-	/** Subtle hint shown during the hold threshold wait */
-	function showHoldHintWidget() {
+	function showWarmupWidget() {
 		if (!ctx?.hasUI) return;
-		ctx.ui.setWidget("voice-recording", (tui, theme) => {
-			return {
-				invalidate() {},
-				render(width: number): string[] {
-					const bar = theme.fg("muted", "─".repeat(Math.min(width - 2, 60)));
-					return [
-						bar,
-						theme.fg("dim", "  Hold " + theme.bold("SPACE") + " for voice input..."),
-						bar,
-					];
-				},
-			};
-		}, { placement: "aboveEditor" });
-	}
-	function hideHoldHintWidget() {
-		if (!ctx?.hasUI) return;
-		ctx.ui.setWidget("voice-recording", undefined);
+		const startTime = Date.now();
+		const renderWarmup = () => {
+			if (!ctx?.hasUI) return;
+			const elapsed = Date.now() - startTime;
+			const progress = Math.min(elapsed / HOLD_THRESHOLD_MS, 1);
+			const barLen = 20;
+			const filled = Math.round(progress * barLen);
+			const empty = barLen - filled;
+			ctx.ui.setWidget("voice-recording", (tui, theme) => {
+				return {
+					invalidate() {},
+					render(width: number): string[] {
+						const maxW = Math.min(width - 2, 60);
+						const bar = theme.fg("accent", "█".repeat(filled)) + theme.fg("muted", "░".repeat(empty));
+						const hint = progress < 0.6
+							? theme.fg("dim", "Keep holding " + theme.bold("SPACE") + " for voice…")
+							: theme.fg("accent", "Almost there… keep holding…");
+						const border = theme.fg("border", "─".repeat(maxW));
+						return [border, `  ${bar}  ${hint}`, border];
+					},
+				};
+			}, { placement: "aboveEditor" });
+		};
+		renderWarmup();
+		warmupWidgetTimer = setInterval(renderWarmup, 50);
 	}
-	/** Animated recording indicator with live waveform */
+	// ─── Recording Widget ───────────────────────────────────────────────────
+	const waveChars = ["▁", "▂", "▃", "▅", "▆", "▇", "▆", "▅", "▃", "▂"];
 	function showRecordingWidget(target: "editor" | "btw") {
 		if (!ctx?.hasUI) return;
-		// Store initial state — once live transcription arrives,
-		// updateLiveTranscriptWidget takes over and we stop the animation.
-		(showRecordingWidget as any)._target = target;
 		(showRecordingWidget as any)._frame = 0;
 		(showRecordingWidget as any)._hasTranscript = false;
-		// Animate the widget every 300ms (only while no transcript is showing)
 		const animTimer = setInterval(() => {
-			// Stop animating once live transcript takes over
 			if ((showRecordingWidget as any)?._hasTranscript) return;
 			(showRecordingWidget as any)._frame = ((showRecordingWidget as any)._frame || 0) + 1;
 			showRecordingWidgetFrame(target, (showRecordingWidget as any)._frame);
 		}, 300);
-		// Store the timer so we can clean it up
 		(showRecordingWidget as any)._animTimer = animTimer;
 		showRecordingWidgetFrame(target, 0);
 	}
-	const waveChars = ["▁", "▂", "▃", "▅", "▆", "▇", "▆", "▅", "▃", "▂"];
 	function showRecordingWidgetFrame(target: "editor" | "btw", frame: number) {
 		if (!ctx?.hasUI) return;
 		ctx.ui.setWidget("voice-recording", (tui, theme) => {
@@ -673,7 +747,6 @@ export default function (pi: ExtensionAPI) {
 					const secs = elapsed % 60;
 					const timeStr = mins > 0 ? `${mins}:${String(secs).padStart(2, "0")}` : `${secs}s`;
-					// Animated waveform
 					const waveLen = 12;
 					let wave = "";
 					for (let i = 0; i < waveLen; i++) {
@@ -702,33 +775,25 @@ export default function (pi: ExtensionAPI) {
 							? theme.fg("dim", "  Release SPACE to finalize")
 							: theme.fg("dim", "  Release SPACE to stop");
-					const lines = [
+					return [
 						topBorder,
 						theme.fg("borderAccent", "│") + pad(titleLine, maxW) + theme.fg("borderAccent", "│"),
 						theme.fg("borderAccent", "│") + pad(hint, maxW) + theme.fg("borderAccent", "│"),
 						botBorder,
 					];
-					return lines;
 				},
 			};
 		}, { placement: "aboveEditor" });
 	}
-	function stopRecordingWidgetAnimation() {
-		const timer = (showRecordingWidget as any)?._animTimer;
-		if (timer) {
-			clearInterval(timer);
-			(showRecordingWidget as any)._animTimer = null;
-		}
-	}
+	// ─── Live Transcript Widget ─────────────────────────────────────────────
-	/** Show live transcript inside a themed box */
 	function updateLiveTranscriptWidget(interim: string, finals: string[]) {
 		if (!ctx?.hasUI) return;
-		// Stop the recording animation — live transcript takes over
+		// Stop the waveform animation — live transcript takes over
 		(showRecordingWidget as any)._hasTranscript = true;
-		stopRecordingWidgetAnimation();
+		clearRecordingAnimTimer();
 		const finalized = finals.join(" ");
 		const displayText = finalized + (interim ? (finalized ? " " : "") + interim : "");
@@ -756,15 +821,14 @@ export default function (pi: ExtensionAPI) {
 					const label = theme.bold(theme.fg("accent", " VOICE "));
 					const timeStyled = theme.fg("muted", timeStr);
 					const titleLine = `  ${dot} ${label}  ${timeStyled}`;
-					const hint = theme.fg("dim", "  Release SPACE to stop");
+					const hint = theme.fg("dim", "  Release SPACE to finalize");
 					const lines = [topBorder, side(titleLine)];
 					if (!displayText.trim()) {
-						lines.push(side(theme.fg("dim", "  Listening... speak now")));
+						lines.push(side(theme.fg("dim", "  Listening… speak now")));
 					} else {
 						lines.push(sep);
-						// Word-wrap the transcript text
-						const innerMax = maxW - 4; // padding inside box
+						const innerMax = maxW - 4;
 						const words = displayText.split(" ");
 						const wrappedLines: string[] = [];
 						let currentLine = "";
@@ -779,11 +843,9 @@ export default function (pi: ExtensionAPI) {
 						}
 						if (currentLine) wrappedLines.push(currentLine);
-						// Show last 3 lines of transcript
 						const visible = wrappedLines.slice(-3);
 						for (let i = 0; i < visible.length; i++) {
 							let line = visible[i];
-							// Style: finalized parts in normal text, interim in accent
 							if (i === visible.length - 1 && interim) {
 								line = theme.fg("text", line) + theme.fg("accent", "▍");
 							} else {
@@ -801,8 +863,9 @@ export default function (pi: ExtensionAPI) {
 		}, { placement: "aboveEditor" });
 	}
-	/** Transcribing state — show a processing indicator */
-	function showTranscribingWidget() {
+	// ─── Finalizing Widget ──────────────────────────────────────────────────
+	function showFinalizingWidget() {
 		if (!ctx?.hasUI) return;
 		ctx.ui.setWidget("voice-recording", (tui, theme) => {
 			return {
@@ -818,7 +881,7 @@ export default function (pi: ExtensionAPI) {
 					};
 					const spinner = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
 					const idx = Math.floor(Date.now() / 100) % spinner.length;
-					const line = `  ${theme.fg("accent", spinner[idx])} ${theme.fg("dim", "Finalizing transcription...")}`;
+					const line = `  ${theme.fg("accent", spinner[idx])} ${theme.fg("dim", "Finalizing transcription…")}`;
 					return [topBorder, side(line), botBorder];
 				},
 			};
@@ -828,104 +891,147 @@ export default function (pi: ExtensionAPI) {
 	// ─── Voice: Start / Stop (Streaming or Legacy) ───────────────────────────
 	async function startVoiceRecording(target: "editor" | "btw" = "editor"): Promise<boolean> {
-		if (voiceState !== "idle" || !ctx) return false;
+		if (!ctx) return false;
+		// ── SESSION CORRUPTION GUARD ──
+		// If a previous session is still recording or finalizing, abort it first.
+		// This prevents the "slow connection overlaps new recording" bug.
+		if (voiceState === "finalizing" || voiceState === "recording") {
+			abortSession(activeSession);
+			activeSession = null;
+			clearRecordingAnimTimer();
+			clearWarmupWidget();
+			hideWidget();
+			setVoiceState("idle");
+			// Brief pause to let resources release
+			await new Promise((r) => setTimeout(r, CORRUPTION_GUARD_MS));
+		}
+		// ── STALE TRANSCRIPT CLEANUP ──
+		// Clear any prior transcript from the widget
+		hideWidget();
 		currentTarget = target;
 		recordingStart = Date.now();
+		retryAttempts = 0;
 		if (isDeepgramStreaming(config)) {
-			// === STREAMING PATH === (Deepgram WebSocket)
-			setVoiceState("recording");
+			return startStreamingRecording(target);
+		} else {
+			return startLegacyRecording(target);
+		}
+	}
-			const session = startStreamingSession(config, {
-				onTranscript: (interim, finals) => {
-					updateLiveTranscriptWidget(interim, finals);
-					updateVoiceStatus();
-				},
-				onDone: (fullText) => {
-					activeSession = null;
-					stopRecordingWidgetAnimation();
-					ctx?.ui.setWidget("voice-recording", undefined);
+	async function startStreamingRecording(target: "editor" | "btw"): Promise<boolean> {
+		setVoiceState("recording");
-					if (!fullText.trim()) {
+		const session = startStreamingSession(config, {
+			onTranscript: (interim, finals) => {
+				// Live transcript update — this is the key UX feature
+				updateLiveTranscriptWidget(interim, finals);
+				updateVoiceStatus();
+			},
+			onDone: (fullText, meta) => {
+				activeSession = null;
+				clearRecordingAnimTimer();
+				hideWidget();
+				lastStopTime = Date.now();
+				if (!fullText.trim()) {
+					// ── DISTINGUISH SILENCE VS NO SPEECH ──
+					if (!meta.hadAudio) {
+						ctx?.ui.notify("Microphone captured no audio. Check mic permissions.", "error");
+					} else if (!meta.hadSpeech) {
+						ctx?.ui.notify("Microphone captured silence — no speech detected.", "warning");
+					} else {
 						ctx?.ui.notify("No speech detected.", "warning");
-						setVoiceState("idle");
-						return;
 					}
+					setVoiceState("idle");
+					return;
+				}
-					if (target === "btw") {
-						handleBtw(fullText);
-					} else {
-						if (ctx?.hasUI) {
-							const existing = ctx.ui.getEditorText();
-							ctx.ui.setEditorText(existing ? existing + " " + fullText : fullText);
-							const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
-							ctx.ui.notify(
-								`STT (${elapsed}s): ${fullText.slice(0, 80)}${fullText.length > 80 ? "..." : ""}`,
-								"info",
-							);
-						}
+				if (target === "btw") {
+					handleBtw(fullText);
+				} else {
+					if (ctx?.hasUI) {
+						const existing = ctx.ui.getEditorText();
+						ctx.ui.setEditorText(existing ? existing + " " + fullText : fullText);
+						const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
+						ctx.ui.notify(
+							`STT (${elapsed}s): ${fullText.slice(0, 80)}${fullText.length > 80 ? "…" : ""}`,
+							"info",
+						);
 					}
-					setVoiceState("idle");
-				},
-				onError: (err) => {
-					activeSession = null;
-					stopRecordingWidgetAnimation();
-					ctx?.ui.setWidget("voice-recording", undefined);
-					ctx?.ui.notify(`STT error: ${err}`, "error");
-					setVoiceState("idle");
-				},
-			});
+				}
+				setVoiceState("idle");
+			},
+			onError: (err) => {
+				activeSession = null;
+				clearRecordingAnimTimer();
+				hideWidget();
+				// ── TRANSIENT FAILURE RETRY ──
+				// On any session error, auto-retry up to MAX_RETRY_ATTEMPTS times after RETRY_DELAY_MS
+				if (retryAttempts < MAX_RETRY_ATTEMPTS) {
+					retryAttempts++;
+					ctx?.ui.notify(`Voice connection error — retrying (${retryAttempts}/${MAX_RETRY_ATTEMPTS})…`, "warning");
+					setTimeout(() => {
+						if (voiceState !== "idle") {
+							setVoiceState("idle");
+						}
+						startStreamingRecording(target);
+					}, RETRY_DELAY_MS);
+					return;
+				}
-			if (!session) {
+				ctx?.ui.notify(`STT error: ${err}`, "error");
 				setVoiceState("idle");
-				return false;
-			}
+			},
+		});
-			activeSession = session;
+		if (!session) {
+			setVoiceState("idle");
+			return false;
+		}
-			// Status timer for elapsed time
-			statusTimer = setInterval(() => {
-				if (voiceState === "recording") {
-					updateVoiceStatus();
-					const elapsed = (Date.now() - recordingStart) / 1000;
-					if (elapsed >= MAX_RECORDING_SECS) {
-						isHolding = false;
-						stopVoiceRecording(target);
-					}
+		activeSession = session;
+		// Status timer for elapsed time
+		statusTimer = setInterval(() => {
+			if (voiceState === "recording") {
+				updateVoiceStatus();
+				const elapsed = (Date.now() - recordingStart) / 1000;
+				if (elapsed >= MAX_RECORDING_SECS) {
+					stopVoiceRecording(target);
 				}
-			}, 1000);
+			}
+		}, 1000);
-			// Show the themed recording widget
-			showRecordingWidget(target);
-			return true;
+		showRecordingWidget(target);
+		return true;
+	}
-		} else {
-			// === LEGACY PATH === (file-based for local backends)
-			tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
-			if (!startLegacyRecordingToFile(tempFile)) {
-				ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
-				return false;
-			}
+	async function startLegacyRecording(target: "editor" | "btw"): Promise<boolean> {
+		if (!ctx) return false;
+		tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
+		if (!startLegacyRecordingToFile(tempFile)) {
+			ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
+			return false;
+		}
-			setVoiceState("recording");
-			statusTimer = setInterval(() => {
-				if (voiceState === "recording") {
-					updateVoiceStatus();
-					const elapsed = (Date.now() - recordingStart) / 1000;
-					if (elapsed >= MAX_RECORDING_SECS) {
-						isHolding = false;
-						stopVoiceRecording(target);
-					}
+		setVoiceState("recording");
+		statusTimer = setInterval(() => {
+			if (voiceState === "recording") {
+				updateVoiceStatus();
+				const elapsed = (Date.now() - recordingStart) / 1000;
+				if (elapsed >= MAX_RECORDING_SECS) {
+					stopVoiceRecording(target);
 				}
-			}, 1000);
-			if (ctx.hasUI) {
-				// Show themed recording widget for legacy path
-				showRecordingWidget(target);
 			}
-			return true;
-		}
+		}, 1000);
+		showRecordingWidget(target);
+		return true;
 	}
 	async function stopVoiceRecording(target: "editor" | "btw" = "editor") {
@@ -933,34 +1039,35 @@ export default function (pi: ExtensionAPI) {
 		if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
 		if (activeSession) {
-			// === STREAMING PATH === Stop the stream, finalize will call onDone
-			setVoiceState("transcribing");
-			stopRecordingWidgetAnimation();
-			showTranscribingWidget();
+			setVoiceState("finalizing");
+			clearRecordingAnimTimer();
+			showFinalizingWidget();
 			stopStreamingSession(activeSession);
 			return;
 		}
-		// === LEGACY PATH ===
+		// Legacy path
 		const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
 		const audioFile = tempFile;
-		setVoiceState("transcribing");
-		stopRecordingWidgetAnimation();
-		showTranscribingWidget();
+		setVoiceState("finalizing");
+		clearRecordingAnimTimer();
+		showFinalizingWidget();
 		await stopLegacyRecording();
 		if (!audioFile || !fs.existsSync(audioFile)) {
 			ctx.ui.notify("No audio recorded.", "warning");
+			hideWidget();
 			setVoiceState("idle");
 			return;
 		}
 		const stats = fs.statSync(audioFile);
 		if (stats.size < 1000) {
-			ctx.ui.notify("Recording too short.", "warning");
+			ctx.ui.notify("Recording too short — mic captured silence.", "warning");
 			try { fs.unlinkSync(audioFile); } catch {}
 			tempFile = null;
+			hideWidget();
 			setVoiceState("idle");
 			return;
 		}
@@ -971,6 +1078,8 @@ export default function (pi: ExtensionAPI) {
 		try { fs.unlinkSync(audioFile); } catch {}
 		if (tempFile === audioFile) tempFile = null;
+		hideWidget();
 		if (result.error) {
 			ctx.ui.notify(`STT error: ${result.error}`, "error");
 			setVoiceState("idle");
@@ -991,7 +1100,7 @@ export default function (pi: ExtensionAPI) {
 				const existing = ctx.ui.getEditorText();
 				ctx.ui.setEditorText(existing ? existing + " " + transcript : transcript);
 				ctx.ui.notify(
-					`STT (${elapsed}s): ${transcript.slice(0, 80)}${transcript.length > 80 ? "..." : ""}`,
+					`STT (${elapsed}s): ${transcript.slice(0, 80)}${transcript.length > 80 ? "…" : ""}`,
 					"info",
 				);
 			}
@@ -1000,83 +1109,46 @@ export default function (pi: ExtensionAPI) {
 		setVoiceState("idle");
 	}
-	// ─── Hold-to-talk with Duration Threshold ──────────────────────────────
+	// ─── Hold-to-Talk State Machine ─────────────────────────────────────────
 	//
-	// SPACE activates voice ONLY when:
-	//   1. The editor is empty (no text typed yet)
-	//   2. SPACE is held for ≥ HOLD_THRESHOLD_MS (500ms)
+	// SPACE key handling with strict hold-duration detection:
 	//
-	// If SPACE is released before the threshold, a regular space character
-	// is typed into the editor (normal typing behavior).
+	//   1. SPACE press (first) → enter "warmup" state, start 500ms timer
+	//   2. During warmup: show progress bar, consume repeat presses
+	//   3. Timer fires → transition to "recording", start voice capture
+	//   4. SPACE release → stop recording, finalize
+	//   5. If released during warmup → cancel, type a space character
 	//
-	// KEY DESIGN for non-Kitty terminals (no key-release events):
-	//   Holding a key generates rapid press events (~30ms apart). We detect
-	//   "release" by watching for the stream of space presses to STOP.
-	//   Once the gap exceeds RELEASE_DETECT_MS (200ms), we know the user
-	//   lifted their finger and we stop recording.
-	//
-	// Flow:
-	//   Hold SPACE → rapid presses arrive → first press starts 500ms timer →
-	//   timer fires → recording starts → presses keep coming (consumed) →
-	//   user releases → presses stop → 200ms silence → auto-stop recording
-	//
-	// Kitty protocol terminals get true key-release events and work natively.
-	const HOLD_THRESHOLD_MS = 500; // minimum hold time before voice activates
-	const RELEASE_DETECT_MS = 200; // gap in key-repeat that means "released"
-	let kittyReleaseDetected = false;
-	let spaceDownTime: number | null = null;
-	let holdActivationTimer: ReturnType<typeof setTimeout> | null = null;
-	let spaceConsumed = false;
-	let lastSpacePressTime = 0;
-	let releaseDetectTimer: ReturnType<typeof setTimeout> | null = null;
-	function clearHoldTimer() {
-		if (holdActivationTimer) {
-			clearTimeout(holdActivationTimer);
-			holdActivationTimer = null;
-		}
-	}
-	function clearReleaseTimer() {
-		if (releaseDetectTimer) {
-			clearTimeout(releaseDetectTimer);
-			releaseDetectTimer = null;
-		}
-	}
+	// Non-Kitty detection: rapid press events = "holding"; a gap longer than RELEASE_DETECT_MS = "released"
-	/** Called when we detect the user has released SPACE (non-Kitty) */
 	function onSpaceReleaseDetected() {
 		releaseDetectTimer = null;
-		// If we're still in the threshold wait (< 500ms), user just tapped space
-		if (spaceDownTime && !spaceConsumed) {
+		// Released during warmup — cancel, type a space
+		if (voiceState === "warmup") {
 			clearHoldTimer();
+			clearWarmupWidget();
+			hideWidget();
+			setVoiceState("idle");
 			spaceDownTime = null;
 			spaceConsumed = false;
-			// Insert a space character
 			if (ctx?.hasUI) {
 				ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
-				hideHoldHintWidget();
 			}
 			return;
 		}
-		// If we're recording, stop
+		// Released during recording — stop
 		if (spaceConsumed && voiceState === "recording") {
-			isHolding = false;
 			spaceConsumed = false;
 			spaceDownTime = null;
 			stopVoiceRecording("editor");
 		}
 	}
-	/** Reset the release detection timer — called on every space press */
 	function resetReleaseDetect() {
 		clearReleaseTimer();
-		// If we're in a hold state (threshold pending or recording),
-		// start a timer to detect release
-		if (spaceDownTime || spaceConsumed || voiceState === "recording") {
+		if (voiceState === "warmup" || voiceState === "recording" || spaceDownTime || spaceConsumed) {
 			releaseDetectTimer = setTimeout(onSpaceReleaseDetected, RELEASE_DETECT_MS);
 		}
 	}
@@ -1091,27 +1163,26 @@ export default function (pi: ExtensionAPI) {
 			// ── SPACE handling ──
 			if (matchesKey(data, "space")) {
-				// Check editor content — hold-to-talk still works even with content,
-				// but a quick tap types a space as normal
-				const editorHasContent = !!(ctx?.hasUI && ctx.ui.getEditorText()?.trim().length);
 				// ── Kitty key-release ──
 				if (isKeyRelease(data)) {
 					kittyReleaseDetected = true;
 					clearReleaseTimer();
-					// Released before threshold → type a space character
-					if (spaceDownTime && !spaceConsumed) {
+					// Released during warmup → cancel, type a space
+					if (voiceState === "warmup") {
 						clearHoldTimer();
+						clearWarmupWidget();
+						hideWidget();
+						setVoiceState("idle");
 						spaceDownTime = null;
 						spaceConsumed = false;
 						if (ctx?.hasUI) ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
 						return { consume: true };
 					}
-					// Released after threshold → stop recording (true hold-to-talk)
+					// Released during recording → stop
 					if (spaceConsumed && voiceState === "recording") {
-						isHolding = false;
 						spaceConsumed = false;
 						spaceDownTime = null;
 						stopVoiceRecording("editor");
@@ -1123,60 +1194,58 @@ export default function (pi: ExtensionAPI) {
 					return undefined;
 				}
-				// ── Kitty key-repeat: ALWAYS suppress while holding/recording ──
+				// ── Kitty key-repeat: suppress while in warmup/recording/finalizing, or once SPACE is consumed ──
 				if (isKeyRepeat(data)) {
-					if (spaceDownTime || spaceConsumed || isHolding || voiceState === "recording") {
-						resetReleaseDetect(); // keep resetting — still holding
+					if (voiceState === "warmup" || voiceState === "recording" || voiceState === "finalizing" || spaceConsumed) {
+						resetReleaseDetect();
 						return { consume: true };
 					}
 					return undefined;
 				}
 				// === Key PRESS ===
-				// In non-Kitty terminals, holding a key sends rapid press events.
-				// We use these to detect "still holding" and the gap to detect "released".
-				// Reset release detection — user is still holding
 				resetReleaseDetect();
-				// If transcribing → ignore
-				if (voiceState === "transcribing") {
+				// If finalizing → ignore
+				if (voiceState === "finalizing") {
 					return { consume: true };
 				}
-				// If already recording → just consume (release detect handles stop)
+				// If already recording → just consume (release handles stop)
 				if (voiceState === "recording") {
 					return { consume: true };
 				}
-				// If we already started the hold timer, this is a repeat → consume
-				if (spaceDownTime) {
+				// If already in warmup → consume (threshold timer is running)
+				if (voiceState === "warmup") {
+					return { consume: true };
+				}
+				// If a hold is already tracked (spaceDownTime set) or its space was consumed → consume
+				if (spaceConsumed || spaceDownTime) {
 					return { consume: true };
 				}
-				// Idle, first press → start the hold timer
+				// IDLE — first press → start warmup
 				if (voiceState === "idle") {
 					spaceDownTime = Date.now();
 					spaceConsumed = false;
-					lastSpacePressTime = Date.now();
-					// Show a subtle "preparing" indicator
-					if (ctx?.hasUI) {
-						showHoldHintWidget();
-					}
+					// Transition to warmup state
+					setVoiceState("warmup");
+					showWarmupWidget();
 					// After threshold: activate voice recording
-					// Works regardless of whether editor has content — hold always activates voice
 					holdActivationTimer = setTimeout(() => {
 						holdActivationTimer = null;
-						if (voiceState === "idle" && spaceDownTime) {
+						if (voiceState === "warmup" && spaceDownTime) {
+							clearWarmupWidget();
 							spaceConsumed = true;
-							isHolding = true;
 							startVoiceRecording("editor").then((ok) => {
 								if (!ok) {
-									isHolding = false;
 									spaceConsumed = false;
 									spaceDownTime = null;
+									setVoiceState("idle");
 								}
 							});
 						} else {
@@ -1188,17 +1257,19 @@ export default function (pi: ExtensionAPI) {
 					return { consume: true };
 				}
-				if (isHolding || spaceConsumed) return { consume: true };
+				if (spaceConsumed) return { consume: true };
 				return undefined;
 			}
-			// ── Any other key while holding space (pre-threshold) → cancel hold, insert space ──
-			if (spaceDownTime && !spaceConsumed && !matchesKey(data, "space")) {
+			// ── Any other key during warmup → cancel hold, type a space ──
+			if (voiceState === "warmup" && spaceDownTime && !spaceConsumed) {
 				clearHoldTimer();
 				clearReleaseTimer();
+				clearWarmupWidget();
+				hideWidget();
+				setVoiceState("idle");
 				if (ctx?.hasUI) {
 					ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
-					hideHoldHintWidget();
 				}
 				spaceDownTime = null;
 				spaceConsumed = false;
@@ -1209,8 +1280,7 @@ export default function (pi: ExtensionAPI) {
 			if (matchesKey(data, "ctrl+shift+b")) {
 				if (isKeyRelease(data)) {
 					kittyReleaseDetected = true;
-					if (isHolding && voiceState === "recording") {
-						isHolding = false;
+					if (voiceState === "recording" && currentTarget === "btw") {
 						stopVoiceRecording("btw");
 						return { consume: true };
 					}
@@ -1218,25 +1288,23 @@ export default function (pi: ExtensionAPI) {
 				}
 				if (isKeyRepeat(data)) {
-					if (isHolding) return { consume: true };
+					if (voiceState === "recording" && currentTarget === "btw") return { consume: true };
 					return undefined;
 				}
-				if (voiceState === "recording") {
-					isHolding = false;
+				if (voiceState === "recording" && currentTarget === "btw") {
 					stopVoiceRecording("btw");
 					return { consume: true };
 				}
-				if (voiceState === "idle" && !isHolding) {
-					isHolding = true;
-					startVoiceRecording("btw").then((ok) => {
-						if (!ok) isHolding = false;
-					});
+				if (voiceState === "idle") {
+					startVoiceRecording("btw");
 					return { consume: true };
 				}
-				if (isHolding) return { consume: true };
+				if (voiceState === "recording" || voiceState === "finalizing" || voiceState === "warmup") {
+					return { consume: true };
+				}
 				return undefined;
 			}
@@ -1280,12 +1348,12 @@ export default function (pi: ExtensionAPI) {
 			"",
 		];
-		lines.push(`  Q: ${last.question.slice(0, 100)}${last.question.length > 100 ? "..." : ""}`);
+		lines.push(`  Q: ${last.question.slice(0, 100)}${last.question.length > 100 ? "…" : ""}`);
 		const answerLines = last.answer.split("\n");
 		for (const line of answerLines.slice(0, 8)) {
 			lines.push(`  ${line}`);
 		}
-		if (answerLines.length > 8) lines.push("  ...");
+		if (answerLines.length > 8) lines.push("  …");
 		lines.push("");
 		lines.push("  /btw:clear to dismiss | /btw:inject to send to agent");
@@ -1301,9 +1369,9 @@ export default function (pi: ExtensionAPI) {
 		ctx.ui.setWidget("btw", [
 			" BTW",
 			"",
-			`  Q: ${message.slice(0, 100)}${message.length > 100 ? "..." : ""}`,
+			`  Q: ${message.slice(0, 100)}${message.length > 100 ? "…" : ""}`,
 			"",
-			"  Thinking...",
+			"  Thinking…",
 		], { placement: "aboveEditor" });
 		const btwContext = buildBtwContext();
@@ -1388,16 +1456,12 @@ export default function (pi: ExtensionAPI) {
 				return;
 			}
 			if (voiceState === "idle") {
-				// Direct start — bypass hold threshold
 				spaceConsumed = true;
-				isHolding = true;
 				const ok = await startVoiceRecording("editor");
 				if (!ok) {
-					isHolding = false;
 					spaceConsumed = false;
 				}
 			} else if (voiceState === "recording") {
-				isHolding = false;
 				spaceConsumed = false;
 				spaceDownTime = null;
 				clearHoldTimer();
@@ -1416,9 +1480,7 @@ export default function (pi: ExtensionAPI) {
 		configSource = loaded.source;
 		updateSocketPath(config, currentCwd);
-		// Auto-capture DEEPGRAM_API_KEY from env into config if not already stored.
-		// This ensures streaming works even when Pi is launched from a context
-		// that doesn't source .zshrc (GUI app, tmux, etc.)
+		// Auto-capture DEEPGRAM_API_KEY from env into config if not already stored
 		if (process.env.DEEPGRAM_API_KEY && !config.deepgramApiKey) {
 			config.deepgramApiKey = process.env.DEEPGRAM_API_KEY;
 			if (configSource !== "default") {
@@ -1426,7 +1488,7 @@ export default function (pi: ExtensionAPI) {
 			}
 		}
-		// Also try to load DEEPGRAM_API_KEY from shell if not in process.env and not in config
+		// Deepgram backend with no key resolved: try reading DEEPGRAM_API_KEY from an interactive zsh
 		if (!resolveDeepgramApiKey(config) && config.backend === "deepgram") {
 			try {
 				const result = spawnSync("zsh", ["-ic", "echo $DEEPGRAM_API_KEY"], {
@@ -1437,7 +1499,7 @@ export default function (pi: ExtensionAPI) {
 				const shellKey = result.stdout?.toString().trim();
 				if (shellKey && shellKey.length > 5) {
 					config.deepgramApiKey = shellKey;
-					process.env.DEEPGRAM_API_KEY = shellKey; // Also set for child processes
+					process.env.DEEPGRAM_API_KEY = shellKey;
 					if (configSource !== "default") {
 						saveConfig(config, config.scope, currentCwd);
 					}
@@ -1448,7 +1510,6 @@ export default function (pi: ExtensionAPI) {
 		if (config.enabled && config.onboarding.completed) {
 			updateVoiceStatus();
 			setupHoldToTalk();
-			// Only start daemon for non-streaming backends
 			if (!isDeepgramStreaming(config)) {
 				ensureDaemon(config).catch(() => {});
 			}
@@ -1509,7 +1570,15 @@ export default function (pi: ExtensionAPI) {
 					ensureDaemon(config).catch(() => {});
 				}
 				const mode = isDeepgramStreaming(config) ? "Deepgram streaming" : config.backend;
-				cmdCtx.ui.notify(`Voice enabled (${mode}).\n  Hold SPACE (empty editor) → release to transcribe\n  Ctrl+Shift+V → toggle recording on/off\n  Live transcription shown while speaking`, "info");
+				cmdCtx.ui.notify([
+					`Voice enabled (${mode}).`,
+					"",
+					"  Hold SPACE (500ms) → release to transcribe",
+					"  Ctrl+Shift+V → toggle recording on/off",
+					"  Quick SPACE tap → types a space (no voice)",
+					"",
+					"  Live transcription shown while speaking",
+				].join("\n"), "info");
 				return;
 			}
@@ -1524,9 +1593,14 @@ export default function (pi: ExtensionAPI) {
 			if (sub === "stop") {
 				if (voiceState === "recording") {
-					isHolding = false;
 					await stopVoiceRecording("editor");
 					cmdCtx.ui.notify("Recording stopped and transcribed.", "info");
+				} else if (voiceState === "warmup") {
+					clearHoldTimer();
+					clearWarmupWidget();
+					hideWidget();
+					setVoiceState("idle");
+					cmdCtx.ui.notify("Warmup cancelled.", "info");
 				} else {
 					cmdCtx.ui.notify("No recording in progress.", "info");
 				}
@@ -1534,7 +1608,7 @@ export default function (pi: ExtensionAPI) {
 			}
 			if (sub === "test") {
-				cmdCtx.ui.notify("Testing voice setup...", "info");
+				cmdCtx.ui.notify("Testing voice setup…", "info");
 				const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
 				const dgKey = resolveDeepgramApiKey(config);
 				const streaming = isDeepgramStreaming(config);
@@ -1551,11 +1625,15 @@ export default function (pi: ExtensionAPI) {
 					`  model status: ${modelReadiness}`,
 					`  language: ${config.language}`,
 					`  streaming: ${streaming ? "YES (Deepgram WS)" : "NO (batch)"}`,
-					`  DEEPGRAM_API_KEY: ${dgKey ? "set (" + dgKey.slice(0, 8) + "...)" : "NOT SET"}`,
+					`  DEEPGRAM_API_KEY: ${dgKey ? "set (" + dgKey.slice(0, 8) + "…)" : "NOT SET"}`,
 					`  onboarding: ${config.onboarding.completed ? "complete" : "incomplete"}`,
 					`  python3: ${diagnostics.hasPython ? "OK" : "missing"}`,
 					`  sox/rec: ${diagnostics.hasSox ? "OK" : "missing"}`,
 					`  daemon: ${daemonUp ? "running" : "not running"}`,
+					`  state: ${voiceState}`,
+					`  hold threshold: ${HOLD_THRESHOLD_MS}ms`,
+					`  release detect: ${RELEASE_DETECT_MS}ms`,
+					`  kitty protocol: ${kittyReleaseDetected ? "detected" : "not detected"}`,
 				];
 				if (diagnostics.hasSox) {
@@ -1609,14 +1687,15 @@ export default function (pi: ExtensionAPI) {
 					`  setup:      ${config.onboarding.completed ? `complete (${config.onboarding.source ?? "unknown"})` : "incomplete"}`,
 					`  socket:     ${activeSocketPath}`,
 					`  daemon:     ${daemonUp ? "running" : "stopped"}${daemonInfo}`,
-					`  hold-key:   SPACE (editor empty) or Ctrl+Shift+V (toggle)`,
+					`  hold-key:   SPACE (hold ≥${HOLD_THRESHOLD_MS}ms) or Ctrl+Shift+V (toggle)`,
 					`  btw-key:    Ctrl+Shift+B (hold to record → auto-btw)`,
+					`  kitty:      ${kittyReleaseDetected ? "yes" : "no"}`,
 				].join("\n"), "info");
 				return;
 			}
 			if (sub === "daemon" || sub === "daemon start") {
-				cmdCtx.ui.notify("Starting STT daemon...", "info");
+				cmdCtx.ui.notify("Starting STT daemon…", "info");
 				const ok = await ensureDaemon(config);
 				cmdCtx.ui.notify(ok ? "Daemon started." : "Failed to start daemon.", ok ? "info" : "error");
 				return;
@@ -1858,7 +1937,7 @@ export default function (pi: ExtensionAPI) {
 				return;
 			}
-			cmdCtx.ui.notify("Summarizing BTW thread...", "info");
+			cmdCtx.ui.notify("Summarizing BTW thread…", "info");
 			try {
 				let summary = "";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@codexstar/pi-listen",
-  "version": "1.0.17",
+  "version": "1.0.18",
   "description": "Voice input, first-run onboarding, and side-channel BTW conversations for Pi",
   "type": "module",
   "keywords": [