npm - @codexstar/pi-listen - Versions diffs - 1.0.12 → 1.0.13 - Mend

@codexstar/pi-listen 1.0.12 → 1.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/extensions/voice/config.ts +4 -0
package/extensions/voice.ts +556 -131
package/package.json +1 -1

package/extensions/voice/config.ts CHANGED Viewed

@@ -31,6 +31,8 @@ export interface VoiceConfig {
 	scope: VoiceSettingsScope;
 	btwEnabled: boolean;
 	onboarding: VoiceOnboardingState;
+	/** Deepgram API key — stored in config so it's available even when env var isn't set */
+	deepgramApiKey?: string;
 }
 export interface LoadedVoiceConfig {
@@ -60,6 +62,7 @@ export const DEFAULT_CONFIG: VoiceConfig = {
 	model: "small",
 	scope: "global",
 	btwEnabled: true,
+	deepgramApiKey: undefined,
 	onboarding: {
 		completed: false,
 		schemaVersion: VOICE_CONFIG_VERSION,
@@ -121,6 +124,7 @@ function migrateConfig(rawVoice: any, source: VoiceConfigSource): VoiceConfig {
 		model: typeof rawVoice.model === "string" ? rawVoice.model : DEFAULT_CONFIG.model,
 		scope: (rawVoice.scope as VoiceSettingsScope | undefined) ?? (source === "project" ? "project" : "global"),
 		btwEnabled: typeof rawVoice.btwEnabled === "boolean" ? rawVoice.btwEnabled : DEFAULT_CONFIG.btwEnabled,
+		deepgramApiKey: typeof rawVoice.deepgramApiKey === "string" ? rawVoice.deepgramApiKey : undefined,
 		onboarding: normalizeOnboarding(rawVoice.onboarding, fallbackCompleted),
 	};
 }

package/extensions/voice.ts CHANGED Viewed

@@ -1,22 +1,27 @@
 /**
- * pi-voice — Voice input + BTW side conversations for Pi CLI.
+ * pi-voice — Deepgram WebSocket streaming STT for Pi CLI.
  *
- * Features:
- *   1. Hold-spacebar to talk (Kitty protocol key release detection)
- *      Fallback: Ctrl+Shift+V toggle for non-Kitty terminals
- *   2. BTW side conversations (/btw <msg>, /btw:new, /btw:clear, /btw:inject, /btw:summarize)
- *   3. Voice → BTW glue: Ctrl+Shift+B = hold to record → auto-send as /btw
+ * Architecture (modeled after Claude Code's voice pipeline):
+ *   1. SoX `rec` captures mic audio as raw PCM (16kHz, mono, 16-bit)
+ *      and pipes it to stdout (no file).
+ *   2. Raw PCM chunks are streamed over a WebSocket to Deepgram Nova 3.
+ *   3. Deepgram returns interim + final transcripts in real-time.
+ *   4. Interim transcripts update a live widget above the editor.
+ *   5. On key-release (or toggle stop), a CloseStream message is sent;
+ *      final transcript is injected into the editor.
  *
- * Records audio via SoX, transcribes via persistent daemon (daemon.py) or fallback subprocess.
- * STT backends: faster-whisper, moonshine, whisper.cpp, deepgram, parakeet.
+ * Activation:
+ *   - Hold SPACE (empty editor) → release to finalize
+ *   - Ctrl+Shift+V → toggle start/stop (fallback for non-Kitty terminals)
+ *   - Ctrl+Shift+B → hold to record → auto-send as /btw
  *
- * Config in ~/.pi/agent/settings.json or <project>/.pi/settings.json:
+ * Config in ~/.pi/agent/settings.json:
  * {
  *   "voice": {
  *     "enabled": true,
  *     "language": "en",
- *     "backend": "faster-whisper",
- *     "model": "small"
+ *     "backend": "deepgram",
+ *     "model": "nova-3"
  *   }
  * }
  */
@@ -65,6 +70,14 @@ interface BtwExchange {
 // ─── Constants ───────────────────────────────────────────────────────────────
 const SAMPLE_RATE = 16000;
+const CHANNELS = 1;
+const ENCODING = "linear16";
+const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
+const KEEPALIVE_INTERVAL_MS = 8000;
+const FINALIZE_SAFETY_TIMEOUT_MS = 5000;
+const FINALIZE_NO_DATA_TIMEOUT_MS = 1500;
+const MAX_RECORDING_SECS = 120; // 2 minutes safety cap (streaming is efficient)
 const EXT_DIR = path.dirname(new URL(import.meta.url).pathname);
 const PROJECT_ROOT = path.join(EXT_DIR, "..");
 const DAEMON_SCRIPT = path.join(PROJECT_ROOT, "daemon.py");
@@ -74,7 +87,7 @@ function commandExists(cmd: string): boolean {
 	return spawnSync("which", [cmd], { stdio: "pipe", timeout: 3000 }).status === 0;
 }
-// ─── Daemon Communication ────────────────────────────────────────────────────
+// ─── Daemon Communication (kept for non-deepgram local backends) ─────────────
 let activeSocketPath = getSocketPath({
 	scope: DEFAULT_CONFIG.scope,
@@ -135,8 +148,6 @@ async function isDaemonRunning(socketPath = activeSocketPath): Promise<boolean>
 async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
 	if (await isDaemonRunning(activeSocketPath)) {
 		const status = await daemonSend({ cmd: "status" }, 3000, activeSocketPath);
-		// When backend is 'auto', accept any loaded backend — the daemon already
-		// resolved 'auto' to a concrete backend, so we don't need to reload.
 		if (config.backend === "auto" || (status.backend === config.backend && status.model === config.model)) return true;
 		const reloaded = await daemonSend({
 			cmd: "load",
@@ -175,7 +186,6 @@ async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
 		proc.on("error", () => resolve(false));
-		// Timeout: if daemon doesn't start in 10s, kill orphan and fall back
 		setTimeout(() => {
 			if (!started) {
 				try { proc.kill(); } catch {}
@@ -185,46 +195,40 @@ async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
 	});
 }
-// ─── Audio Recording ─────────────────────────────────────────────────────────
+// ─── Legacy file-based transcription (for non-deepgram backends) ─────────────
-let recProcess: ChildProcess | null = null;
+let legacyRecProcess: ChildProcess | null = null;
-function startRecordingToFile(outPath: string): boolean {
-	if (recProcess) {
-		recProcess.kill("SIGTERM");
-		recProcess = null;
+function startLegacyRecordingToFile(outPath: string): boolean {
+	if (legacyRecProcess) {
+		legacyRecProcess.kill("SIGTERM");
+		legacyRecProcess = null;
 	}
 	if (!commandExists("rec")) return false;
-	recProcess = spawn("rec", [
+	legacyRecProcess = spawn("rec", [
 		"-q", "-r", String(SAMPLE_RATE), "-c", "1", "-b", "16", outPath,
 	], { stdio: ["pipe", "pipe", "pipe"] });
-	recProcess.stderr?.on("data", () => {});
-	recProcess.on("error", () => { recProcess = null; });
+	legacyRecProcess.stderr?.on("data", () => {});
+	legacyRecProcess.on("error", () => { legacyRecProcess = null; });
 	return true;
 }
-function stopRecording(): Promise<void> {
+function stopLegacyRecording(): Promise<void> {
 	return new Promise((resolve) => {
-		if (!recProcess) { resolve(); return; }
-		recProcess.on("close", () => { recProcess = null; resolve(); });
-		recProcess.kill("SIGTERM");
+		if (!legacyRecProcess) { resolve(); return; }
+		legacyRecProcess.on("close", () => { legacyRecProcess = null; resolve(); });
+		legacyRecProcess.kill("SIGTERM");
 		setTimeout(() => {
-			if (recProcess) { recProcess.kill("SIGKILL"); recProcess = null; }
+			if (legacyRecProcess) { legacyRecProcess.kill("SIGKILL"); legacyRecProcess = null; }
 			resolve();
 		}, 2000);
 	});
 }
-// ─── Transcription (daemon or fallback) ──────────────────────────────────────
-async function transcribeAudio(
+async function transcribeAudioFile(
 	audioPath: string,
 	config: VoiceConfig,
 ): Promise<{ text: string; duration: number; error?: string }> {
-	// Try daemon first
 	if (await isDaemonRunning()) {
 		const resp = await daemonSend({
 			cmd: "transcribe",
@@ -238,13 +242,10 @@ async function transcribeAudio(
 			return resp as { text: string; duration: number };
 		}
 	}
-	// Fallback: direct subprocess
 	return new Promise((resolve) => {
 		const args = [TRANSCRIBE_SCRIPT, "--language", config.language, audioPath];
 		if (config.backend !== "auto") args.splice(1, 0, "--backend", config.backend);
 		if (config.model) args.splice(1, 0, "--model", config.model);
 		const proc = spawn("python3", args, { stdio: ["pipe", "pipe", "pipe"] });
 		let stdout = "";
 		let stderr = "";
@@ -258,6 +259,250 @@ async function transcribeAudio(
 	});
 }
+// ─── Deepgram WebSocket Streaming ────────────────────────────────────────────
+/**
+ * Live state for one Deepgram streaming transcription: the socket, the SoX
+ * recorder feeding it, the transcript gathered so far, and the callbacks
+ * used to report progress.
+ */
+interface StreamingSession {
+	/** WebSocket connection to Deepgram that carries audio out and transcripts back. */
+	ws: WebSocket;
+	/** SoX `rec` child process whose stdout supplies the raw PCM audio. */
+	recProcess: ChildProcess;
+	interimText: string;      // Current interim (partial) transcript
+	finalizedParts: string[]; // All finalized transcript segments
+	/** Periodic KeepAlive sender; null until the socket opens and again after finalization. */
+	keepAliveTimer: ReturnType<typeof setInterval> | null;
+	/** True once the session has been torn down; guards against finalizing twice. */
+	closed: boolean;
+	/** Invoked after each transcript result with the current interim text and all finalized segments. */
+	onTranscript: (interim: string, finals: string[]) => void;
+	/** Invoked when the session is finalized, with the assembled transcript text. */
+	onDone: (fullText: string) => void;
+	/** Invoked with a human-readable message when the socket, Deepgram, or SoX reports a failure. */
+	onError: (err: string) => void;
+}
+/** Read the Deepgram API key from the environment; returns null when DEEPGRAM_API_KEY is unset or empty. */
+function getDeepgramApiKey(): string | null {
+	// Env var only — the settings.json fallback is handled by resolveDeepgramApiKey(config).
+	return process.env.DEEPGRAM_API_KEY || null;
+}
+/**
+ * Look up the Deepgram API key, preferring the environment over saved settings.
+ *
+ * @param config Voice settings; `deepgramApiKey` is the fallback source.
+ * @returns The first non-empty key found, or null when neither source has one.
+ */
+function resolveDeepgramApiKey(config: VoiceConfig): string | null {
+	const envKey = process.env.DEEPGRAM_API_KEY;
+	if (envKey) return envKey;
+	const storedKey = config.deepgramApiKey;
+	return storedKey ? storedKey : null;
+}
+/**
+ * Decide whether voice input should take the Deepgram streaming path.
+ *
+ * @param config Voice settings to inspect.
+ * @returns True only when an API key can be resolved and the backend is
+ *          either "deepgram" or "auto".
+ */
+function isDeepgramStreaming(config: VoiceConfig): boolean {
+	// Without a key there is nothing to stream to, whatever the backend says.
+	const hasKey = Boolean(resolveDeepgramApiKey(config));
+	const backendAllowsStreaming = config.backend === "deepgram" || config.backend === "auto";
+	return hasKey && backendAllowsStreaming;
+}
+/**
+ * Assemble the Deepgram listen URL with the audio format and recognition
+ * options as query parameters.
+ *
+ * @param config Voice settings; `language` and `model` fall back to "en" and
+ *               "nova-3" when empty.
+ * @returns The full WebSocket URL including the query string.
+ */
+function buildDeepgramWsUrl(config: VoiceConfig): string {
+	const query = new URLSearchParams();
+	// Audio format — must match what the SoX recorder emits.
+	query.set("encoding", ENCODING);
+	query.set("sample_rate", String(SAMPLE_RATE));
+	query.set("channels", String(CHANNELS));
+	// Silence thresholds (ms): phrase boundary, then end of utterance.
+	query.set("endpointing", "300");
+	query.set("utterance_end_ms", "1000");
+	// NOTE(review): config.model is forwarded verbatim — confirm that a
+	// non-Deepgram model name (DEFAULT_CONFIG uses "small") cannot reach here.
+	query.set("language", config.language || "en");
+	query.set("model", config.model || "nova-3");
+	query.set("smart_format", "true");
+	query.set("interim_results", "true");
+	return DEEPGRAM_WS_URL + "?" + query.toString();
+}
+/**
+ * Start a live transcription session: spawn SoX `rec` writing raw PCM to
+ * stdout and forward each chunk to Deepgram over a WebSocket.
+ *
+ * Exactly one terminal callback fires per session: `onDone` when the session
+ * is finalized, or `onError` when it fails. A failure tears the session down
+ * (keepalive stopped, socket closed, recorder killed) before it is reported,
+ * so the microphone is never left open behind a session the caller has
+ * already discarded, and no late `onDone` follows an `onError`.
+ *
+ * @param config Voice settings; supplies the API key and the URL parameters.
+ * @param callbacks Receivers for live transcripts, the final text, and errors.
+ * @returns The running session, or null when it could not be started
+ *          (`onError` has already been called in that case).
+ */
+function startStreamingSession(
+	config: VoiceConfig,
+	callbacks: {
+		onTranscript: (interim: string, finals: string[]) => void;
+		onDone: (fullText: string) => void;
+		onError: (err: string) => void;
+	},
+): StreamingSession | null {
+	const apiKey = resolveDeepgramApiKey(config);
+	if (!apiKey) {
+		callbacks.onError("DEEPGRAM_API_KEY not set");
+		return null;
+	}
+	if (!commandExists("rec")) {
+		callbacks.onError("Voice requires SoX. Install: brew install sox");
+		return null;
+	}
+	// Start SoX streaming raw PCM to stdout (no file)
+	const recProc = spawn("rec", [
+		"-q",
+		"-r", String(SAMPLE_RATE),
+		"-c", String(CHANNELS),
+		"-b", "16",
+		"-e", "signed-integer",
+		"-t", "raw",
+		"-",  // output to stdout
+	], { stdio: ["pipe", "pipe", "pipe"] });
+	recProc.stderr?.on("data", () => {}); // suppress SoX warnings
+	// Connect WebSocket to Deepgram. If construction throws, the recorder
+	// spawned above must not be left running.
+	let ws: WebSocket;
+	try {
+		ws = new WebSocket(buildDeepgramWsUrl(config), {
+			headers: {
+				"Authorization": `Token ${apiKey}`,
+			},
+		} as any);
+	} catch {
+		recProc.on("error", () => {}); // avoid an unhandled 'error' event on the orphan
+		try { recProc.kill("SIGKILL"); } catch {}
+		callbacks.onError("WebSocket connection error");
+		return null;
+	}
+	const session: StreamingSession = {
+		ws,
+		recProcess: recProc,
+		interimText: "",
+		finalizedParts: [],
+		keepAliveTimer: null,
+		closed: false,
+		onTranscript: callbacks.onTranscript,
+		onDone: callbacks.onDone,
+		onError: callbacks.onError,
+	};
+	// Tear the session down and report the failure exactly once. Marking the
+	// session closed first keeps onclose/finalizeSession from firing onDone.
+	const fail = (message: string) => {
+		if (session.closed) return;
+		session.closed = true;
+		if (session.keepAliveTimer) {
+			clearInterval(session.keepAliveTimer);
+			session.keepAliveTimer = null;
+		}
+		try { ws.close(); } catch {}
+		try { recProc.kill("SIGKILL"); } catch {}
+		session.onError(message);
+	};
+	ws.onopen = () => {
+		// Send initial KeepAlive
+		try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
+		// Start keepalive timer
+		session.keepAliveTimer = setInterval(() => {
+			if (ws.readyState === WebSocket.OPEN) {
+				try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
+			}
+		}, KEEPALIVE_INTERVAL_MS);
+		// Pipe SoX stdout → WebSocket as binary frames
+		recProc.stdout?.on("data", (chunk: Buffer) => {
+			if (ws.readyState === WebSocket.OPEN) {
+				try { ws.send(chunk); } catch {}
+			}
+		});
+	};
+	ws.onmessage = (event: MessageEvent) => {
+		// Messages that arrive after teardown must not reach the UI again.
+		if (session.closed) return;
+		// Only text frames are parsed as JSON; anything else is ignored.
+		if (typeof event.data !== "string") return;
+		try {
+			const msg = JSON.parse(event.data);
+			if (!msg) return;
+			if (msg.type === "Results") {
+				const transcript = msg.channel?.alternatives?.[0]?.transcript || "";
+				if (msg.is_final) {
+					// Final result for this audio segment replaces the interim text.
+					const segment = transcript.trim();
+					if (segment) session.finalizedParts.push(segment);
+					session.interimText = "";
+				} else {
+					// Interim result — live update
+					session.interimText = transcript;
+				}
+				session.onTranscript(session.interimText, session.finalizedParts);
+			} else if (msg.type === "Error" || msg.type === "error") {
+				fail(msg.message || msg.description || "Deepgram error");
+			}
+			// "Metadata" and "UtteranceEnd" need no handling: segments are
+			// already committed when is_final arrives.
+		} catch {
+			// Best-effort: a malformed frame or a throwing callback must not
+			// take down the socket's event dispatch.
+		}
+	};
+	ws.onerror = () => {
+		fail("WebSocket connection error");
+	};
+	ws.onclose = () => {
+		if (!session.closed) {
+			finalizeSession(session);
+		}
+	};
+	recProc.on("error", (err) => {
+		fail(`SoX error: ${err.message}`);
+	});
+	recProc.on("close", () => {
+		// SoX stopped — send CloseStream to Deepgram
+		if (ws.readyState === WebSocket.OPEN) {
+			try { ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
+		}
+	});
+	return session;
+}
+/**
+ * Ask a running session to wind down: stop the recorder, tell Deepgram the
+ * stream is over, and arrange for the session to be finalized once results
+ * stop arriving (or after a hard timeout). Does nothing if already closed.
+ *
+ * @param session The session to stop.
+ */
+function stopStreamingSession(session: StreamingSession): void {
+	if (session.closed) return;
+	const socket = session.ws;
+	// Release the microphone first so no further audio is captured.
+	try { session.recProcess.kill("SIGTERM"); } catch {}
+	// CloseStream asks Deepgram to flush the audio it has not transcribed yet.
+	if (socket.readyState === WebSocket.OPEN) {
+		try { socket.send(JSON.stringify({ type: "CloseStream" })); } catch {}
+	}
+	const finalizeIfStillOpen = () => {
+		if (!session.closed) finalizeSession(session);
+	};
+	// Hard upper bound: finalize even if Deepgram never answers.
+	setTimeout(finalizeIfStillOpen, FINALIZE_SAFETY_TIMEOUT_MS);
+	// Idle detection: record when each message arrives, then poll for a
+	// quiet period longer than FINALIZE_NO_DATA_TIMEOUT_MS.
+	let lastMessageAt = Date.now();
+	const previousHandler = socket.onmessage;
+	socket.onmessage = (event: MessageEvent) => {
+		lastMessageAt = Date.now();
+		if (previousHandler) previousHandler.call(socket, event);
+	};
+	const idlePoll = setInterval(() => {
+		const quietFor = Date.now() - lastMessageAt;
+		if (quietFor <= FINALIZE_NO_DATA_TIMEOUT_MS) return;
+		clearInterval(idlePoll);
+		finalizeIfStillOpen();
+	}, 500);
+}
+/**
+ * Tear a session down and deliver its transcript through `onDone`.
+ * Only the first call has any effect; later calls return immediately.
+ *
+ * The delivered text is every finalized segment plus any interim text that
+ * was still pending. Finalization can be triggered by a timeout before the
+ * last segment is confirmed, and those words were already shown in the live
+ * widget, so dropping them would silently lose speech.
+ *
+ * @param session The session to close.
+ */
+function finalizeSession(session: StreamingSession): void {
+	if (session.closed) return;
+	session.closed = true;
+	// Clean up keepalive
+	if (session.keepAliveTimer) {
+		clearInterval(session.keepAliveTimer);
+		session.keepAliveTimer = null;
+	}
+	// Close WebSocket
+	try { session.ws.close(); } catch {}
+	// Kill SoX if still running
+	try { session.recProcess.kill("SIGKILL"); } catch {}
+	// interimText is reset on every final result, so anything left in it
+	// covers audio after the last finalized segment and cannot duplicate one.
+	const parts = [...session.finalizedParts];
+	const pendingInterim = session.interimText.trim();
+	if (pendingInterim) parts.push(pendingInterim);
+	session.onDone(parts.join(" ").trim());
+}
 // ─── Extension ───────────────────────────────────────────────────────────────
 export default function (pi: ExtensionAPI) {
@@ -272,6 +517,10 @@ export default function (pi: ExtensionAPI) {
 	let terminalInputUnsub: (() => void) | null = null;
 	let isHolding = false;
+	// Streaming session state
+	let activeSession: StreamingSession | null = null;
+	let currentTarget: "editor" | "btw" = "editor";
 	// ─── BTW State ───────────────────────────────────────────────────────────
 	let btwThread: BtwExchange[] = [];
@@ -289,17 +538,19 @@ export default function (pi: ExtensionAPI) {
 				}
 				const modeTag = !config.onboarding.completed
 					? "SETUP"
-					: config.mode === "api"
-						? "API"
-						: config.mode === "local"
-							? "LOCAL"
-							: "AUTO";
+					: isDeepgramStreaming(config)
+						? "STREAM"
+						: config.mode === "api"
+							? "API"
+							: config.mode === "local"
+								? "LOCAL"
+								: "AUTO";
 				ctx.ui.setStatus("voice", `MIC ${modeTag}`);
 				break;
 			}
 			case "recording": {
 				const secs = Math.round((Date.now() - recordingStart) / 1000);
-				ctx.ui.setStatus("voice", `REC ${secs}s`);
+				ctx.ui.setStatus("voice", `🔴 REC ${secs}s`);
 				break;
 			}
 			case "transcribing":
@@ -315,7 +566,11 @@ export default function (pi: ExtensionAPI) {
 	function voiceCleanup() {
 		if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
-		if (recProcess) { recProcess.kill("SIGTERM"); recProcess = null; }
+		if (activeSession) {
+			finalizeSession(activeSession);
+			activeSession = null;
+		}
+		if (legacyRecProcess) { legacyRecProcess.kill("SIGTERM"); legacyRecProcess = null; }
 		if (tempFile) { try { fs.unlinkSync(tempFile); } catch {} tempFile = null; }
 		isHolding = false;
 		setVoiceState("idle");
@@ -332,7 +587,7 @@ export default function (pi: ExtensionAPI) {
 		const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
 		const provisioningPlan = buildProvisioningPlan(nextConfig, diagnostics);
 		let validated = provisioningPlan.ready;
-		if (validated && nextConfig.enabled) {
+		if (validated && nextConfig.enabled && !isDeepgramStreaming(nextConfig)) {
 			validated = await ensureDaemon(nextConfig);
 		}
@@ -349,53 +604,173 @@ export default function (pi: ExtensionAPI) {
 		].join("\n"), validated ? "info" : "warning");
 	}
-	// ─── Voice: Start / Stop / Transcribe ────────────────────────────────────
+	// ─── Live Transcript Widget ──────────────────────────────────────────────
-	const MAX_RECORDING_SECS = 30; // Safety cap: auto-stop after 30s
+	function updateLiveTranscriptWidget(interim: string, finals: string[]) {
+		if (!ctx?.hasUI) return;
-	async function startVoiceRecording(target: "editor" | "btw" = "editor"): Promise<boolean> {
-		if (voiceState !== "idle" || !ctx) return false;
+		const finalized = finals.join(" ");
+		const displayText = finalized + (interim ? (finalized ? " " : "") + interim : "");
-		tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
-		if (!startRecordingToFile(tempFile)) {
-			ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
-			return false;
+		if (!displayText.trim()) {
+			ctx.ui.setWidget("voice-recording", [
+				" 🎙 Listening... (speak now)",
+			], { placement: "aboveEditor" });
+			return;
+		}
+		// Word-wrap the live transcript; only the last 4 lines are shown (see slice below)
+		const words = displayText.split(" ");
+		const lines: string[] = [];
+		let currentLine = " 🎙 ";
+		const maxLineLen = 70;
+		for (const word of words) {
+			if ((currentLine + word).length > maxLineLen) {
+				lines.push(currentLine);
+				currentLine = "    " + word + " ";
+			} else {
+				currentLine += word + " ";
+			}
 		}
+		if (currentLine.trim()) lines.push(currentLine);
+		// Keep only last 4 lines to avoid widget overflow
+		const visibleLines = lines.slice(-4);
+		if (interim) {
+			// Show a blinking cursor for interim text
+			const lastIdx = visibleLines.length - 1;
+			visibleLines[lastIdx] = visibleLines[lastIdx].trimEnd() + "▍";
+		}
+		ctx.ui.setWidget("voice-recording", visibleLines, { placement: "aboveEditor" });
+	}
+	// ─── Voice: Start / Stop (Streaming or Legacy) ───────────────────────────
+	async function startVoiceRecording(target: "editor" | "btw" = "editor"): Promise<boolean> {
+		if (voiceState !== "idle" || !ctx) return false;
+		currentTarget = target;
 		recordingStart = Date.now();
-		setVoiceState("recording");
-		statusTimer = setInterval(() => {
-			if (voiceState === "recording") {
-				updateVoiceStatus();
-				// Safety: auto-stop after MAX_RECORDING_SECS
-				const elapsed = (Date.now() - recordingStart) / 1000;
-				if (elapsed >= MAX_RECORDING_SECS) {
-					isHolding = false;
-					stopVoiceRecording(target);
+		if (isDeepgramStreaming(config)) {
+			// === STREAMING PATH === (Deepgram WebSocket)
+			setVoiceState("recording");
+			const session = startStreamingSession(config, {
+				onTranscript: (interim, finals) => {
+					updateLiveTranscriptWidget(interim, finals);
+					updateVoiceStatus();
+				},
+				onDone: (fullText) => {
+					activeSession = null;
+					ctx?.ui.setWidget("voice-recording", undefined);
+					if (!fullText.trim()) {
+						ctx?.ui.notify("No speech detected.", "warning");
+						setVoiceState("idle");
+						return;
+					}
+					if (target === "btw") {
+						handleBtw(fullText);
+					} else {
+						if (ctx?.hasUI) {
+							const existing = ctx.ui.getEditorText();
+							ctx.ui.setEditorText(existing ? existing + " " + fullText : fullText);
+							const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
+							ctx.ui.notify(
+								`STT (${elapsed}s): ${fullText.slice(0, 80)}${fullText.length > 80 ? "..." : ""}`,
+								"info",
+							);
+						}
+					}
+					setVoiceState("idle");
+				},
+				onError: (err) => {
+					activeSession = null;
+					ctx?.ui.setWidget("voice-recording", undefined);
+					ctx?.ui.notify(`STT error: ${err}`, "error");
+					setVoiceState("idle");
+				},
+			});
+			if (!session) {
+				setVoiceState("idle");
+				return false;
+			}
+			activeSession = session;
+			// Status timer for elapsed time
+			statusTimer = setInterval(() => {
+				if (voiceState === "recording") {
+					updateVoiceStatus();
+					const elapsed = (Date.now() - recordingStart) / 1000;
+					if (elapsed >= MAX_RECORDING_SECS) {
+						isHolding = false;
+						stopVoiceRecording(target);
+					}
 				}
+			}, 1000);
+			if (ctx.hasUI) {
+				ctx.ui.setWidget("voice-recording", [
+					" 🎙 Listening... speak now — press SPACE again to stop",
+				], { placement: "aboveEditor" });
 			}
-		}, 1000);
+			return true;
-		if (ctx.hasUI) {
-			ctx.ui.setWidget("voice-recording", [
-				target === "btw"
-					? " 🎙 BTW Recording... Ctrl+Shift+V to stop"
-					: " 🎙 Recording... Ctrl+Shift+V to stop (or release SPACE)",
-			], { placement: "aboveEditor" });
+		} else {
+			// === LEGACY PATH === (file-based for local backends)
+			tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
+			if (!startLegacyRecordingToFile(tempFile)) {
+				ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
+				return false;
+			}
+			setVoiceState("recording");
+			statusTimer = setInterval(() => {
+				if (voiceState === "recording") {
+					updateVoiceStatus();
+					const elapsed = (Date.now() - recordingStart) / 1000;
+					if (elapsed >= MAX_RECORDING_SECS) {
+						isHolding = false;
+						stopVoiceRecording(target);
+					}
+				}
+			}, 1000);
+			if (ctx.hasUI) {
+				ctx.ui.setWidget("voice-recording", [
+					target === "btw"
+						? " 🎙 BTW Recording... Ctrl+Shift+V to stop"
+						: " 🎙 Recording... Ctrl+Shift+V to stop (or release SPACE)",
+				], { placement: "aboveEditor" });
+			}
+			return true;
 		}
-		return true;
 	}
 	async function stopVoiceRecording(target: "editor" | "btw" = "editor") {
 		if (voiceState !== "recording" || !ctx) return;
 		if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
+		if (activeSession) {
+			// === STREAMING PATH === Stop the stream, finalize will call onDone
+			setVoiceState("transcribing");
+			stopStreamingSession(activeSession);
+			return;
+		}
+		// === LEGACY PATH ===
 		const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
-		const audioFile = tempFile; // capture before cleanup can null it
+		const audioFile = tempFile;
 		setVoiceState("transcribing");
 		ctx.ui.setWidget("voice-recording", undefined);
-		await stopRecording();
+		await stopLegacyRecording();
 		if (!audioFile || !fs.existsSync(audioFile)) {
 			ctx.ui.notify("No audio recorded.", "warning");
@@ -412,12 +787,9 @@ export default function (pi: ExtensionAPI) {
 			return;
 		}
-		// Ensure daemon is up before transcribing — await so the warm path
-		// is available for this request instead of falling through to the
-		// cold subprocess fallback.
 		await ensureDaemon(config).catch(() => {});
-		const result = await transcribeAudio(audioFile, config);
+		const result = await transcribeAudioFile(audioFile, config);
 		try { fs.unlinkSync(audioFile); } catch {}
 		if (tempFile === audioFile) tempFile = null;
@@ -437,7 +809,6 @@ export default function (pi: ExtensionAPI) {
 		if (target === "btw") {
 			await handleBtw(transcript);
 		} else {
-			// Inject into editor
 			if (ctx.hasUI) {
 				const existing = ctx.ui.getEditorText();
 				ctx.ui.setEditorText(existing ? existing + " " + transcript : transcript);
@@ -451,25 +822,38 @@ export default function (pi: ExtensionAPI) {
 		setVoiceState("idle");
 	}
-	// ─── Hold-to-talk via Kitty protocol ─────────────────────────────────────
+	// ─── Hold-to-talk / Toggle-to-talk ──────────────────────────────────────
+	//
+	// Kitty protocol terminals (Ghostty, WezTerm, Kitty) send key-release
+	// events (":3u" sequences), enabling true hold-to-talk.
+	//
+	// Non-Kitty terminals (Apple Terminal, iTerm2 without config, basic xterm)
+	// only send key-press. We detect this and fall back to toggle:
+	//   1st SPACE press → start recording
+	//   2nd SPACE press → stop recording + transcribe
+	//
+	// We auto-detect Kitty support: if we see a key-release within the first
+	// recording, we know hold-to-talk works. Otherwise, we stay in toggle mode.
+	let kittyReleaseDetected = false; // have we ever seen a Kitty release event?
 	function setupHoldToTalk() {
 		if (!ctx?.hasUI) return;
-		// Remove previous listener
 		if (terminalInputUnsub) { terminalInputUnsub(); terminalInputUnsub = null; }
 		terminalInputUnsub = ctx.ui.onTerminalInput((data: string) => {
 			if (!config.enabled) return undefined;
-			// Hold SPACE → talk → release → transcribe to editor
+			// ── SPACE handling ──
 			if (matchesKey(data, "space")) {
-				// Only activate when editor is empty (avoid conflicting with typing)
 				const editorText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
 				if (editorText && editorText.trim().length > 0) return undefined;
+				// Kitty key-release: stop recording
 				if (isKeyRelease(data)) {
-					if (isHolding) {
+					kittyReleaseDetected = true;
+					if (isHolding && voiceState === "recording") {
 						isHolding = false;
 						stopVoiceRecording("editor");
 						return { consume: true };
@@ -477,12 +861,27 @@ export default function (pi: ExtensionAPI) {
 					return undefined;
 				}
+				// Kitty key-repeat: suppress while holding
 				if (isKeyRepeat(data)) {
 					if (isHolding) return { consume: true };
 					return undefined;
 				}
-				// Key press — start recording
+				// === Key PRESS ===
+				// Currently recording? → this is the "stop" press (toggle mode)
+				if (voiceState === "recording") {
+					isHolding = false;
+					stopVoiceRecording("editor");
+					return { consume: true };
+				}
+				// Currently transcribing? → ignore, wait for it to finish
+				if (voiceState === "transcribing") {
+					return { consume: true };
+				}
+				// Idle → start recording
 				if (voiceState === "idle" && !isHolding) {
 					isHolding = true;
 					startVoiceRecording("editor").then((ok) => {
@@ -495,10 +894,11 @@ export default function (pi: ExtensionAPI) {
 				return undefined;
 			}
-			// Hold Ctrl+Shift+B → talk → release → auto-btw
+			// ── Ctrl+Shift+B handling (BTW voice) ──
 			if (matchesKey(data, "ctrl+shift+b")) {
 				if (isKeyRelease(data)) {
-					if (isHolding) {
+					kittyReleaseDetected = true;
+					if (isHolding && voiceState === "recording") {
 						isHolding = false;
 						stopVoiceRecording("btw");
 						return { consume: true };
@@ -511,6 +911,13 @@ export default function (pi: ExtensionAPI) {
 					return undefined;
 				}
+				// Toggle: stop if recording
+				if (voiceState === "recording") {
+					isHolding = false;
+					stopVoiceRecording("btw");
+					return { consume: true };
+				}
 				if (voiceState === "idle" && !isHolding) {
 					isHolding = true;
 					startVoiceRecording("btw").then((ok) => {
@@ -523,12 +930,6 @@ export default function (pi: ExtensionAPI) {
 				return undefined;
 			}
-			// Any other key while holding = cancel
-			if (isHolding && voiceState === "recording") {
-				// Don't cancel on modifier-only events
-				return undefined;
-			}
 			return undefined;
 		});
 	}
@@ -536,7 +937,6 @@ export default function (pi: ExtensionAPI) {
 	// ─── BTW: Side Conversations ─────────────────────────────────────────────
 	function buildBtwContext(): string {
-		// Build context from main session + btw thread
 		const systemPrompt = ctx?.getSystemPrompt() ?? "";
 		let btwContext = "You are a helpful side-channel assistant. ";
 		btwContext += "The user is having a parallel conversation while their main Pi agent works. ";
@@ -570,7 +970,6 @@ export default function (pi: ExtensionAPI) {
 			"",
 		];
-		// Show last exchange
 		lines.push(`  Q: ${last.question.slice(0, 100)}${last.question.length > 100 ? "..." : ""}`);
 		const answerLines = last.answer.split("\n");
 		for (const line of answerLines.slice(0, 8)) {
@@ -589,7 +988,6 @@ export default function (pi: ExtensionAPI) {
 		btwWidgetVisible = true;
-		// Show thinking state
 		ctx.ui.setWidget("btw", [
 			" BTW",
 			"",
@@ -598,10 +996,8 @@ export default function (pi: ExtensionAPI) {
 			"  Thinking...",
 		], { placement: "aboveEditor" });
-		// Build context for LLM
 		const btwContext = buildBtwContext();
-		// Use the model registry to get current model
 		const model = ctx.model;
 		if (!model) {
 			const exchange: BtwExchange = {
@@ -616,7 +1012,6 @@ export default function (pi: ExtensionAPI) {
 		}
 		try {
-			// Stream the response
 			let answer = "";
 			const eventStream = streamSimple(model, {
 				systemPrompt: btwContext,
@@ -633,7 +1028,6 @@ export default function (pi: ExtensionAPI) {
 					break;
 				}
-				// Update widget with streaming response
 				const displayLines: string[] = [
 					` BTW`,
 					"",
@@ -657,7 +1051,6 @@ export default function (pi: ExtensionAPI) {
 			pi.appendEntry("btw", exchange);
 			updateBtwWidget();
 		} catch (err: any) {
-			// Fallback: send as a follow-up message to the main agent
 			const exchange: BtwExchange = {
 				question: message,
 				answer: `(BTW streaming failed: ${err.message}. Falling back to sendUserMessage.)`,
@@ -667,7 +1060,6 @@ export default function (pi: ExtensionAPI) {
 			pi.appendEntry("btw", exchange);
 			updateBtwWidget();
-			// Use sendUserMessage as alternative
 			pi.sendUserMessage(
 				`[BTW question]: ${message}`,
 				{ deliverAs: "followUp" },
@@ -677,7 +1069,6 @@ export default function (pi: ExtensionAPI) {
 	// ─── Shortcuts ───────────────────────────────────────────────────────────
-	// Ctrl+Shift+V = toggle voice (fallback for non-Kitty terminals)
 	pi.registerShortcut("ctrl+shift+v", {
 		description: "Toggle voice recording (start/stop)",
 		handler: async (handlerCtx) => {
@@ -705,12 +1096,42 @@ export default function (pi: ExtensionAPI) {
 		configSource = loaded.source;
 		updateSocketPath(config, currentCwd);
-		// No auto-popup on startup. Users run `/voice setup` to configure.
-		// Only activate voice features if setup has been completed previously.
+		// Auto-capture DEEPGRAM_API_KEY from env into config if not already stored.
+		// This ensures streaming works even when Pi is launched from a context
+		// that doesn't source .zshrc (GUI app, tmux, etc.)
+		if (process.env.DEEPGRAM_API_KEY && !config.deepgramApiKey) {
+			config.deepgramApiKey = process.env.DEEPGRAM_API_KEY;
+			if (configSource !== "default") {
+				saveConfig(config, config.scope, currentCwd);
+			}
+		}
+		// Also try to load DEEPGRAM_API_KEY from shell if not in process.env and not in config
+		if (!resolveDeepgramApiKey(config) && config.backend === "deepgram") {
+			try {
+				const result = spawnSync("zsh", ["-ic", "echo $DEEPGRAM_API_KEY"], {
+					stdio: ["pipe", "pipe", "pipe"],
+					timeout: 3000,
+					env: { ...process.env, HOME: os.homedir() },
+				});
+				const shellKey = result.stdout?.toString().trim();
+				if (shellKey && shellKey.length > 5) {
+					config.deepgramApiKey = shellKey;
+					process.env.DEEPGRAM_API_KEY = shellKey; // Also set for child processes
+					if (configSource !== "default") {
+						saveConfig(config, config.scope, currentCwd);
+					}
+				}
+			} catch {}
+		}
 		if (config.enabled && config.onboarding.completed) {
 			updateVoiceStatus();
 			setupHoldToTalk();
-			ensureDaemon(config).catch(() => {});
+			// Only start daemon for non-streaming backends
+			if (!isDeepgramStreaming(config)) {
+				ensureDaemon(config).catch(() => {});
+			}
 		}
 	});
@@ -764,8 +1185,11 @@ export default function (pi: ExtensionAPI) {
 				config.enabled = true;
 				updateVoiceStatus();
 				setupHoldToTalk();
-				ensureDaemon(config).catch(() => {});
-				cmdCtx.ui.notify("Voice enabled.\n  Hold SPACE (empty editor) → release to transcribe\n  Ctrl+Shift+V → toggle recording on/off\n  Auto-stops after 30s", "info");
+				if (!isDeepgramStreaming(config)) {
+					ensureDaemon(config).catch(() => {});
+				}
+				const mode = isDeepgramStreaming(config) ? "Deepgram streaming" : config.backend;
+				cmdCtx.ui.notify(`Voice enabled (${mode}).\n  Hold SPACE (empty editor) → release to transcribe\n  Ctrl+Shift+V → toggle recording on/off\n  Live transcription shown while speaking`, "info");
 				return;
 			}
@@ -779,7 +1203,6 @@ export default function (pi: ExtensionAPI) {
 			}
 			if (sub === "stop") {
-				// Emergency stop — cancel any active recording
 				if (voiceState === "recording") {
 					isHolding = false;
 					await stopVoiceRecording("editor");
@@ -793,6 +1216,8 @@ export default function (pi: ExtensionAPI) {
 			if (sub === "test") {
 				cmdCtx.ui.notify("Testing voice setup...", "info");
 				const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
+				const dgKey = resolveDeepgramApiKey(config);
+				const streaming = isDeepgramStreaming(config);
 				const daemonUp = await isDaemonRunning();
 				const provisioningPlan = buildProvisioningPlan(config, diagnostics);
 				const selectedBackend = diagnostics.backends.find((backend) => backend.name === config.backend);
@@ -805,6 +1230,8 @@ export default function (pi: ExtensionAPI) {
 					`  model: ${config.model}`,
 					`  model status: ${modelReadiness}`,
 					`  language: ${config.language}`,
+					`  streaming: ${streaming ? "YES (Deepgram WS)" : "NO (batch)"}`,
+					`  DEEPGRAM_API_KEY: ${dgKey ? "set (" + dgKey.slice(0, 8) + "...)" : "NOT SET"}`,
 					`  onboarding: ${config.onboarding.completed ? "complete" : "incomplete"}`,
 					`  python3: ${diagnostics.hasPython ? "OK" : "missing"}`,
 					`  sox/rec: ${diagnostics.hasSox ? "OK" : "missing"}`,
@@ -826,11 +1253,10 @@ export default function (pi: ExtensionAPI) {
 					}
 				}
-				lines.push("", "Suggested commands:");
-				lines.push(...(provisioningPlan.commands.length > 0 ? provisioningPlan.commands.map((command) => `  - ${command}`) : ["  - none"]));
-				if (provisioningPlan.manualSteps.length > 0) {
-					lines.push("", "Manual steps:");
-					lines.push(...provisioningPlan.manualSteps.map((step) => `  - ${step}`));
+				if (!dgKey && config.backend === "deepgram") {
+					lines.push("");
+					lines.push("⚠️  DEEPGRAM_API_KEY not set! Add to ~/.zshrc or ~/.env.secrets");
+					lines.push("   export DEEPGRAM_API_KEY=your_key_here");
 				}
 				cmdCtx.ui.notify(lines.join("\n"), provisioningPlan.ready ? "info" : "warning");
@@ -847,22 +1273,24 @@ export default function (pi: ExtensionAPI) {
 				const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
 				const selectedBackend = diagnostics.backends.find((backend) => backend.name === config.backend);
 				const modelReadiness = getModelReadiness(selectedBackend, config.model);
+				const streaming = isDeepgramStreaming(config);
 				cmdCtx.ui.notify([
 					`Voice config:`,
-					`  enabled:  ${config.enabled}`,
-					`  mode:     ${config.mode}`,
-					`  scope:    ${config.scope}`,
-					`  backend:  ${config.backend}`,
-					`  model:    ${config.model}`,
-					`  model status: ${modelReadiness}`,
-					`  language: ${config.language}`,
-					`  state:    ${voiceState}`,
-					`  setup:    ${config.onboarding.completed ? `complete (${config.onboarding.source ?? "unknown"})` : "incomplete"}`,
-					`  socket:   ${activeSocketPath}`,
-					`  daemon:   ${daemonUp ? "running" : "stopped"}${daemonInfo}`,
-					`  hold-key: SPACE (editor empty) or Ctrl+Shift+V (toggle)`,
-					`  btw-key:  Ctrl+Shift+B (hold to record → auto-btw)`,
+					`  enabled:    ${config.enabled}`,
+					`  mode:       ${config.mode}`,
+					`  scope:      ${config.scope}`,
+					`  backend:    ${config.backend}`,
+					`  model:      ${config.model}`,
+					`  model stat: ${modelReadiness}`,
+					`  language:   ${config.language}`,
+					`  streaming:  ${streaming ? "YES (Deepgram WebSocket)" : "NO (batch)"}`,
+					`  state:      ${voiceState}`,
+					`  setup:      ${config.onboarding.completed ? `complete (${config.onboarding.source ?? "unknown"})` : "incomplete"}`,
+					`  socket:     ${activeSocketPath}`,
+					`  daemon:     ${daemonUp ? "running" : "stopped"}${daemonInfo}`,
+					`  hold-key:   SPACE (editor empty) or Ctrl+Shift+V (toggle)`,
+					`  btw-key:    Ctrl+Shift+B (hold to record → auto-btw)`,
 				].join("\n"), "info");
 				return;
 			}
@@ -905,7 +1333,6 @@ export default function (pi: ExtensionAPI) {
 					cmdCtx.ui.notify("Voice setup cancelled.", "warning");
 					return;
 				}
 				await finalizeAndSaveSetup(cmdCtx, result.config, result.selectedScope, result.summaryLines, "setup-command");
 				return;
 			}
@@ -1013,7 +1440,7 @@ export default function (pi: ExtensionAPI) {
 		},
 	});
-	// ─── Dedicated setup command (discoverable in /command list) ──────────────
+	// ─── Dedicated setup command ─────────────────────────────────────────────
 	pi.registerCommand("voice-setup", {
 		description: "Configure voice input — select backend, model, and language",
@@ -1081,7 +1508,6 @@ export default function (pi: ExtensionAPI) {
 			pi.sendUserMessage(content, { deliverAs: "followUp" });
-			// Clear after injection
 			btwThread = [];
 			btwWidgetVisible = false;
 			cmdCtx.ui.setWidget("btw", undefined);
@@ -1106,7 +1532,6 @@ export default function (pi: ExtensionAPI) {
 				threadText += `Q: ${ex.question}\nA: ${ex.answer}\n\n`;
 			}
-			// Ask the model to summarize
 			const model = ctx.model;
 			if (!model) {
 				cmdCtx.ui.notify("No model available for summarization.", "error");

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@codexstar/pi-listen",
-  "version": "1.0.12",
+  "version": "1.0.13",
   "description": "Voice input, first-run onboarding, and side-channel BTW conversations for Pi",
   "type": "module",
   "keywords": [