@codexstar/pi-listen 1.0.12 → 1.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,8 @@ export interface VoiceConfig {
31
31
  scope: VoiceSettingsScope;
32
32
  btwEnabled: boolean;
33
33
  onboarding: VoiceOnboardingState;
34
+ /** Deepgram API key — stored in config so it's available even when env var isn't set */
35
+ deepgramApiKey?: string;
34
36
  }
35
37
 
36
38
  export interface LoadedVoiceConfig {
@@ -60,6 +62,7 @@ export const DEFAULT_CONFIG: VoiceConfig = {
60
62
  model: "small",
61
63
  scope: "global",
62
64
  btwEnabled: true,
65
+ deepgramApiKey: undefined,
63
66
  onboarding: {
64
67
  completed: false,
65
68
  schemaVersion: VOICE_CONFIG_VERSION,
@@ -121,6 +124,7 @@ function migrateConfig(rawVoice: any, source: VoiceConfigSource): VoiceConfig {
121
124
  model: typeof rawVoice.model === "string" ? rawVoice.model : DEFAULT_CONFIG.model,
122
125
  scope: (rawVoice.scope as VoiceSettingsScope | undefined) ?? (source === "project" ? "project" : "global"),
123
126
  btwEnabled: typeof rawVoice.btwEnabled === "boolean" ? rawVoice.btwEnabled : DEFAULT_CONFIG.btwEnabled,
127
+ deepgramApiKey: typeof rawVoice.deepgramApiKey === "string" ? rawVoice.deepgramApiKey : undefined,
124
128
  onboarding: normalizeOnboarding(rawVoice.onboarding, fallbackCompleted),
125
129
  };
126
130
  }
@@ -1,22 +1,27 @@
1
1
  /**
2
- * pi-voice — Voice input + BTW side conversations for Pi CLI.
2
+ * pi-voice — Deepgram WebSocket streaming STT for Pi CLI.
3
3
  *
4
- * Features:
5
- * 1. Hold-spacebar to talk (Kitty protocol key release detection)
6
- * Fallback: Ctrl+Shift+V toggle for non-Kitty terminals
7
- * 2. BTW side conversations (/btw <msg>, /btw:new, /btw:clear, /btw:inject, /btw:summarize)
8
- * 3. Voice BTW glue: Ctrl+Shift+B = hold to record → auto-send as /btw
4
+ * Architecture (modeled after Claude Code's voice pipeline):
5
+ * 1. SoX `rec` captures mic audio as raw PCM (16kHz, mono, 16-bit)
6
+ * and pipes it to stdout (no file).
7
+ * 2. Raw PCM chunks are streamed over a WebSocket to Deepgram Nova 3.
8
+ * 3. Deepgram returns interim + final transcripts in real-time.
9
+ * 4. Interim transcripts update a live widget above the editor.
10
+ * 5. On key-release (or toggle stop), a CloseStream message is sent;
11
+ * final transcript is injected into the editor.
9
12
  *
10
- * Records audio via SoX, transcribes via persistent daemon (daemon.py) or fallback subprocess.
11
- * STT backends: faster-whisper, moonshine, whisper.cpp, deepgram, parakeet.
13
+ * Activation:
14
+ * - Hold SPACE (empty editor) → release to finalize
15
+ * - Ctrl+Shift+V → toggle start/stop (fallback for non-Kitty terminals)
16
+ * - Ctrl+Shift+B → hold to record → auto-send as /btw
12
17
  *
13
- * Config in ~/.pi/agent/settings.json or <project>/.pi/settings.json:
18
+ * Config in ~/.pi/agent/settings.json:
14
19
  * {
15
20
  * "voice": {
16
21
  * "enabled": true,
17
22
  * "language": "en",
18
- * "backend": "faster-whisper",
19
- * "model": "small"
23
+ * "backend": "deepgram",
24
+ * "model": "nova-3"
20
25
  * }
21
26
  * }
22
27
  */
@@ -65,6 +70,14 @@ interface BtwExchange {
65
70
  // ─── Constants ───────────────────────────────────────────────────────────────
66
71
 
67
72
  const SAMPLE_RATE = 16000;
73
+ const CHANNELS = 1;
74
+ const ENCODING = "linear16";
75
+ const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
76
+ const KEEPALIVE_INTERVAL_MS = 8000;
77
+ const FINALIZE_SAFETY_TIMEOUT_MS = 5000;
78
+ const FINALIZE_NO_DATA_TIMEOUT_MS = 1500;
79
+ const MAX_RECORDING_SECS = 120; // 2 minutes safety cap (streaming is efficient)
80
+
68
81
  const EXT_DIR = path.dirname(new URL(import.meta.url).pathname);
69
82
  const PROJECT_ROOT = path.join(EXT_DIR, "..");
70
83
  const DAEMON_SCRIPT = path.join(PROJECT_ROOT, "daemon.py");
@@ -74,7 +87,7 @@ function commandExists(cmd: string): boolean {
74
87
  return spawnSync("which", [cmd], { stdio: "pipe", timeout: 3000 }).status === 0;
75
88
  }
76
89
 
77
- // ─── Daemon Communication ────────────────────────────────────────────────────
90
+ // ─── Daemon Communication (kept for non-deepgram local backends) ─────────────
78
91
 
79
92
  let activeSocketPath = getSocketPath({
80
93
  scope: DEFAULT_CONFIG.scope,
@@ -135,8 +148,6 @@ async function isDaemonRunning(socketPath = activeSocketPath): Promise<boolean>
135
148
  async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
136
149
  if (await isDaemonRunning(activeSocketPath)) {
137
150
  const status = await daemonSend({ cmd: "status" }, 3000, activeSocketPath);
138
- // When backend is 'auto', accept any loaded backend — the daemon already
139
- // resolved 'auto' to a concrete backend, so we don't need to reload.
140
151
  if (config.backend === "auto" || (status.backend === config.backend && status.model === config.model)) return true;
141
152
  const reloaded = await daemonSend({
142
153
  cmd: "load",
@@ -175,7 +186,6 @@ async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
175
186
 
176
187
  proc.on("error", () => resolve(false));
177
188
 
178
- // Timeout: if daemon doesn't start in 10s, kill orphan and fall back
179
189
  setTimeout(() => {
180
190
  if (!started) {
181
191
  try { proc.kill(); } catch {}
@@ -185,46 +195,40 @@ async function ensureDaemon(config: VoiceConfig): Promise<boolean> {
185
195
  });
186
196
  }
187
197
 
188
- // ─── Audio Recording ─────────────────────────────────────────────────────────
198
+ // ─── Legacy file-based transcription (for non-deepgram backends) ─────────────
189
199
 
190
- let recProcess: ChildProcess | null = null;
200
+ let legacyRecProcess: ChildProcess | null = null;
191
201
 
192
- function startRecordingToFile(outPath: string): boolean {
193
- if (recProcess) {
194
- recProcess.kill("SIGTERM");
195
- recProcess = null;
202
+ function startLegacyRecordingToFile(outPath: string): boolean {
203
+ if (legacyRecProcess) {
204
+ legacyRecProcess.kill("SIGTERM");
205
+ legacyRecProcess = null;
196
206
  }
197
-
198
207
  if (!commandExists("rec")) return false;
199
-
200
- recProcess = spawn("rec", [
208
+ legacyRecProcess = spawn("rec", [
201
209
  "-q", "-r", String(SAMPLE_RATE), "-c", "1", "-b", "16", outPath,
202
210
  ], { stdio: ["pipe", "pipe", "pipe"] });
203
-
204
- recProcess.stderr?.on("data", () => {});
205
- recProcess.on("error", () => { recProcess = null; });
211
+ legacyRecProcess.stderr?.on("data", () => {});
212
+ legacyRecProcess.on("error", () => { legacyRecProcess = null; });
206
213
  return true;
207
214
  }
208
215
 
209
- function stopRecording(): Promise<void> {
216
+ function stopLegacyRecording(): Promise<void> {
210
217
  return new Promise((resolve) => {
211
- if (!recProcess) { resolve(); return; }
212
- recProcess.on("close", () => { recProcess = null; resolve(); });
213
- recProcess.kill("SIGTERM");
218
+ if (!legacyRecProcess) { resolve(); return; }
219
+ legacyRecProcess.on("close", () => { legacyRecProcess = null; resolve(); });
220
+ legacyRecProcess.kill("SIGTERM");
214
221
  setTimeout(() => {
215
- if (recProcess) { recProcess.kill("SIGKILL"); recProcess = null; }
222
+ if (legacyRecProcess) { legacyRecProcess.kill("SIGKILL"); legacyRecProcess = null; }
216
223
  resolve();
217
224
  }, 2000);
218
225
  });
219
226
  }
220
227
 
221
- // ─── Transcription (daemon or fallback) ──────────────────────────────────────
222
-
223
- async function transcribeAudio(
228
+ async function transcribeAudioFile(
224
229
  audioPath: string,
225
230
  config: VoiceConfig,
226
231
  ): Promise<{ text: string; duration: number; error?: string }> {
227
- // Try daemon first
228
232
  if (await isDaemonRunning()) {
229
233
  const resp = await daemonSend({
230
234
  cmd: "transcribe",
@@ -238,13 +242,10 @@ async function transcribeAudio(
238
242
  return resp as { text: string; duration: number };
239
243
  }
240
244
  }
241
-
242
- // Fallback: direct subprocess
243
245
  return new Promise((resolve) => {
244
246
  const args = [TRANSCRIBE_SCRIPT, "--language", config.language, audioPath];
245
247
  if (config.backend !== "auto") args.splice(1, 0, "--backend", config.backend);
246
248
  if (config.model) args.splice(1, 0, "--model", config.model);
247
-
248
249
  const proc = spawn("python3", args, { stdio: ["pipe", "pipe", "pipe"] });
249
250
  let stdout = "";
250
251
  let stderr = "";
@@ -258,6 +259,250 @@ async function transcribeAudio(
258
259
  });
259
260
  }
260
261
 
262
+ // ─── Deepgram WebSocket Streaming ────────────────────────────────────────────
263
+
264
+ interface StreamingSession {
265
+ ws: WebSocket;
266
+ recProcess: ChildProcess;
267
+ interimText: string; // Current interim (partial) transcript
268
+ finalizedParts: string[]; // All finalized transcript segments
269
+ keepAliveTimer: ReturnType<typeof setInterval> | null;
270
+ closed: boolean;
271
+ onTranscript: (interim: string, finals: string[]) => void;
272
+ onDone: (fullText: string) => void;
273
+ onError: (err: string) => void;
274
+ }
275
+
276
+ function getDeepgramApiKey(): string | null {
277
+ // Env var only — resolveDeepgramApiKey(config) additionally falls back to the config file
278
+ return process.env.DEEPGRAM_API_KEY || null;
279
+ }
280
+
281
+ /**
282
+ * Resolve the Deepgram API key from all sources:
283
+ * 1. process.env.DEEPGRAM_API_KEY (shell)
284
+ * 2. config.deepgramApiKey (settings.json, persisted at setup time)
285
+ */
286
+ function resolveDeepgramApiKey(config: VoiceConfig): string | null {
287
+ return process.env.DEEPGRAM_API_KEY || config.deepgramApiKey || null;
288
+ }
289
+
290
+ function isDeepgramStreaming(config: VoiceConfig): boolean {
291
+ const key = resolveDeepgramApiKey(config);
292
+ if (!key) return false;
293
+ // Use streaming for deepgram backend, or auto mode when deepgram key is available
294
+ return config.backend === "deepgram" || (config.backend === "auto" && !!key);
295
+ }
296
+
297
+ function buildDeepgramWsUrl(config: VoiceConfig): string {
298
+ const params = new URLSearchParams({
299
+ encoding: ENCODING,
300
+ sample_rate: String(SAMPLE_RATE),
301
+ channels: String(CHANNELS),
302
+ endpointing: "300", // ms of silence before phrase boundary
303
+ utterance_end_ms: "1000", // ms of silence before utterance is complete
304
+ language: config.language || "en",
305
+ model: config.model || "nova-3",
306
+ smart_format: "true",
307
+ interim_results: "true",
308
+ });
309
+ return `${DEEPGRAM_WS_URL}?${params.toString()}`;
310
+ }
311
+
312
+ function startStreamingSession(
313
+ config: VoiceConfig,
314
+ callbacks: {
315
+ onTranscript: (interim: string, finals: string[]) => void;
316
+ onDone: (fullText: string) => void;
317
+ onError: (err: string) => void;
318
+ },
319
+ ): StreamingSession | null {
320
+ const apiKey = resolveDeepgramApiKey(config);
321
+ if (!apiKey) {
322
+ callbacks.onError("DEEPGRAM_API_KEY not set");
323
+ return null;
324
+ }
325
+
326
+ if (!commandExists("rec")) {
327
+ callbacks.onError("Voice requires SoX. Install: brew install sox");
328
+ return null;
329
+ }
330
+
331
+ // Start SoX streaming raw PCM to stdout (no file)
332
+ const recProc = spawn("rec", [
333
+ "-q",
334
+ "-r", String(SAMPLE_RATE),
335
+ "-c", String(CHANNELS),
336
+ "-b", "16",
337
+ "-e", "signed-integer",
338
+ "-t", "raw",
339
+ "-", // output to stdout
340
+ ], { stdio: ["pipe", "pipe", "pipe"] });
341
+
342
+ recProc.stderr?.on("data", () => {}); // suppress SoX warnings
343
+
344
+ // Connect WebSocket to Deepgram
345
+ const wsUrl = buildDeepgramWsUrl(config);
346
+ const ws = new WebSocket(wsUrl, {
347
+ headers: {
348
+ "Authorization": `Token ${apiKey}`,
349
+ },
350
+ } as any);
351
+
352
+ const session: StreamingSession = {
353
+ ws,
354
+ recProcess: recProc,
355
+ interimText: "",
356
+ finalizedParts: [],
357
+ keepAliveTimer: null,
358
+ closed: false,
359
+ onTranscript: callbacks.onTranscript,
360
+ onDone: callbacks.onDone,
361
+ onError: callbacks.onError,
362
+ };
363
+
364
+ ws.onopen = () => {
365
+ // Send initial KeepAlive
366
+ try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
367
+
368
+ // Start keepalive timer
369
+ session.keepAliveTimer = setInterval(() => {
370
+ if (ws.readyState === WebSocket.OPEN) {
371
+ try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
372
+ }
373
+ }, KEEPALIVE_INTERVAL_MS);
374
+
375
+ // Pipe SoX stdout → WebSocket as binary frames
376
+ recProc.stdout?.on("data", (chunk: Buffer) => {
377
+ if (ws.readyState === WebSocket.OPEN) {
378
+ try { ws.send(chunk); } catch {}
379
+ }
380
+ });
381
+ };
382
+
383
+ ws.onmessage = (event: MessageEvent) => {
384
+ try {
385
+ const msg = typeof event.data === "string" ? JSON.parse(event.data) : null;
386
+ if (!msg) return;
387
+
388
+ if (msg.type === "Results") {
389
+ const alt = msg.channel?.alternatives?.[0];
390
+ const transcript = alt?.transcript || "";
391
+
392
+ if (msg.is_final) {
393
+ // Final result for this audio segment
394
+ if (transcript.trim()) {
395
+ session.finalizedParts.push(transcript.trim());
396
+ }
397
+ session.interimText = "";
398
+ } else {
399
+ // Interim result — live update
400
+ session.interimText = transcript;
401
+ }
402
+
403
+ session.onTranscript(session.interimText, session.finalizedParts);
404
+
405
+ // If speech_final is true, it's the end of an utterance
406
+ // (similar to TranscriptEndpoint in Claude Code's protocol)
407
+ if (msg.speech_final && transcript.trim()) {
408
+ // Already added to finalizedParts above when is_final was true
409
+ }
410
+ } else if (msg.type === "Metadata") {
411
+ // Connection metadata — ignore
412
+ } else if (msg.type === "UtteranceEnd") {
413
+ // Utterance boundary — Deepgram detected end of speech
414
+ // Nothing extra needed, is_final already handles finalization
415
+ } else if (msg.type === "Error" || msg.type === "error") {
416
+ session.onError(msg.message || msg.description || "Deepgram error");
417
+ }
418
+ } catch (e: any) {
419
+ // Ignore malformed JSON text frames (non-string frames are skipped above); this also swallows errors thrown by the callbacks
420
+ }
421
+ };
422
+
423
+ ws.onerror = (event: Event) => {
424
+ if (!session.closed) {
425
+ session.onError("WebSocket connection error");
426
+ }
427
+ };
428
+
429
+ ws.onclose = () => {
430
+ if (!session.closed) {
431
+ finalizeSession(session);
432
+ }
433
+ };
434
+
435
+ recProc.on("error", (err) => {
436
+ session.onError(`SoX error: ${err.message}`);
437
+ });
438
+
439
+ recProc.on("close", () => {
440
+ // SoX stopped — send CloseStream to Deepgram
441
+ if (ws.readyState === WebSocket.OPEN) {
442
+ try { ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
443
+ }
444
+ });
445
+
446
+ return session;
447
+ }
448
+
449
+ function stopStreamingSession(session: StreamingSession): void {
450
+ if (session.closed) return;
451
+
452
+ // Stop the microphone
453
+ try { session.recProcess.kill("SIGTERM"); } catch {}
454
+
455
+ // CloseStream tells Deepgram to flush remaining audio
456
+ if (session.ws.readyState === WebSocket.OPEN) {
457
+ try { session.ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
458
+ }
459
+
460
+ // Safety: finalize after timeout even if Deepgram doesn't respond
461
+ setTimeout(() => {
462
+ if (!session.closed) {
463
+ finalizeSession(session);
464
+ }
465
+ }, FINALIZE_SAFETY_TIMEOUT_MS);
466
+
467
+ // Shorter timeout: if no new data arrives for 1.5s, assume done
468
+ let lastDataTime = Date.now();
469
+ const origOnMessage = session.ws.onmessage;
470
+ session.ws.onmessage = (event: MessageEvent) => {
471
+ lastDataTime = Date.now();
472
+ if (origOnMessage) origOnMessage.call(session.ws, event);
473
+ };
474
+
475
+ const noDataCheck = setInterval(() => {
476
+ if (Date.now() - lastDataTime > FINALIZE_NO_DATA_TIMEOUT_MS) {
477
+ clearInterval(noDataCheck);
478
+ if (!session.closed) {
479
+ finalizeSession(session);
480
+ }
481
+ }
482
+ }, 500);
483
+ }
484
+
485
+ function finalizeSession(session: StreamingSession): void {
486
+ if (session.closed) return;
487
+ session.closed = true;
488
+
489
+ // Clean up keepalive
490
+ if (session.keepAliveTimer) {
491
+ clearInterval(session.keepAliveTimer);
492
+ session.keepAliveTimer = null;
493
+ }
494
+
495
+ // Close WebSocket
496
+ try { session.ws.close(); } catch {}
497
+
498
+ // Kill SoX if still running
499
+ try { session.recProcess.kill("SIGKILL"); } catch {}
500
+
501
+ // Deliver final transcript
502
+ const fullText = session.finalizedParts.join(" ").trim();
503
+ session.onDone(fullText);
504
+ }
505
+
261
506
  // ─── Extension ───────────────────────────────────────────────────────────────
262
507
 
263
508
  export default function (pi: ExtensionAPI) {
@@ -272,6 +517,10 @@ export default function (pi: ExtensionAPI) {
272
517
  let terminalInputUnsub: (() => void) | null = null;
273
518
  let isHolding = false;
274
519
 
520
+ // Streaming session state
521
+ let activeSession: StreamingSession | null = null;
522
+ let currentTarget: "editor" | "btw" = "editor";
523
+
275
524
  // ─── BTW State ───────────────────────────────────────────────────────────
276
525
 
277
526
  let btwThread: BtwExchange[] = [];
@@ -289,17 +538,19 @@ export default function (pi: ExtensionAPI) {
289
538
  }
290
539
  const modeTag = !config.onboarding.completed
291
540
  ? "SETUP"
292
- : config.mode === "api"
293
- ? "API"
294
- : config.mode === "local"
295
- ? "LOCAL"
296
- : "AUTO";
541
+ : isDeepgramStreaming(config)
542
+ ? "STREAM"
543
+ : config.mode === "api"
544
+ ? "API"
545
+ : config.mode === "local"
546
+ ? "LOCAL"
547
+ : "AUTO";
297
548
  ctx.ui.setStatus("voice", `MIC ${modeTag}`);
298
549
  break;
299
550
  }
300
551
  case "recording": {
301
552
  const secs = Math.round((Date.now() - recordingStart) / 1000);
302
- ctx.ui.setStatus("voice", `REC ${secs}s`);
553
+ ctx.ui.setStatus("voice", `🔴 REC ${secs}s`);
303
554
  break;
304
555
  }
305
556
  case "transcribing":
@@ -315,9 +566,17 @@ export default function (pi: ExtensionAPI) {
315
566
 
316
567
  function voiceCleanup() {
317
568
  if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
318
- if (recProcess) { recProcess.kill("SIGTERM"); recProcess = null; }
569
+ clearHoldTimer();
570
+ stopRecordingWidgetAnimation();
571
+ if (activeSession) {
572
+ finalizeSession(activeSession);
573
+ activeSession = null;
574
+ }
575
+ if (legacyRecProcess) { legacyRecProcess.kill("SIGTERM"); legacyRecProcess = null; }
319
576
  if (tempFile) { try { fs.unlinkSync(tempFile); } catch {} tempFile = null; }
320
577
  isHolding = false;
578
+ spaceConsumed = false;
579
+ spaceDownTime = null;
321
580
  setVoiceState("idle");
322
581
  }
323
582
 
@@ -332,7 +591,7 @@ export default function (pi: ExtensionAPI) {
332
591
  const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
333
592
  const provisioningPlan = buildProvisioningPlan(nextConfig, diagnostics);
334
593
  let validated = provisioningPlan.ready;
335
- if (validated && nextConfig.enabled) {
594
+ if (validated && nextConfig.enabled && !isDeepgramStreaming(nextConfig)) {
336
595
  validated = await ensureDaemon(nextConfig);
337
596
  }
338
597
 
@@ -349,53 +608,337 @@ export default function (pi: ExtensionAPI) {
349
608
  ].join("\n"), validated ? "info" : "warning");
350
609
  }
351
610
 
352
- // ─── Voice: Start / Stop / Transcribe ────────────────────────────────────
611
+ // ─── Live Transcript Widget (Component-based, themed) ───────────────────
353
612
 
354
- const MAX_RECORDING_SECS = 30; // Safety cap: auto-stop after 30s
613
+ /** Subtle hint shown during the hold threshold wait */
614
+ function showHoldHintWidget() {
615
+ if (!ctx?.hasUI) return;
616
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
617
+ return {
618
+ invalidate() {},
619
+ render(width: number): string[] {
620
+ const bar = theme.fg("muted", "─".repeat(Math.min(width - 2, 60)));
621
+ return [
622
+ bar,
623
+ theme.fg("dim", " Hold " + theme.bold("SPACE") + " for voice input..."),
624
+ bar,
625
+ ];
626
+ },
627
+ };
628
+ }, { placement: "aboveEditor" });
629
+ }
355
630
 
356
- async function startVoiceRecording(target: "editor" | "btw" = "editor"): Promise<boolean> {
357
- if (voiceState !== "idle" || !ctx) return false;
631
+ function hideHoldHintWidget() {
632
+ if (!ctx?.hasUI) return;
633
+ ctx.ui.setWidget("voice-recording", undefined);
634
+ }
635
+
636
+ /** Animated recording indicator with a decorative waveform (driven by the frame counter, not by audio level) */
637
+ function showRecordingWidget(target: "editor" | "btw") {
638
+ if (!ctx?.hasUI) return;
639
+ let frame = 0;
640
+ const waveChars = ["▁", "▂", "▃", "▅", "▆", "▇", "▆", "▅", "▃", "▂"];
358
641
 
359
- tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
360
- if (!startRecordingToFile(tempFile)) {
361
- ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
362
- return false;
642
+ // Animate the widget every 200ms
643
+ const animTimer = setInterval(() => {
644
+ frame++;
645
+ if (ctx?.hasUI) ctx.ui.setWidget("voice-recording", undefined); // force re-render
646
+ showRecordingWidgetFrame(target, frame, waveChars);
647
+ }, 200);
648
+
649
+ // Store the timer so we can clean it up
650
+ (showRecordingWidget as any)._animTimer = animTimer;
651
+
652
+ showRecordingWidgetFrame(target, frame, waveChars);
653
+ }
654
+
655
+ function showRecordingWidgetFrame(target: "editor" | "btw", frame: number, waveChars: string[]) {
656
+ if (!ctx?.hasUI) return;
657
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
658
+ return {
659
+ invalidate() {},
660
+ render(width: number): string[] {
661
+ const maxW = Math.min(width - 2, 72);
662
+ const elapsed = Math.round((Date.now() - recordingStart) / 1000);
663
+ const mins = Math.floor(elapsed / 60);
664
+ const secs = elapsed % 60;
665
+ const timeStr = mins > 0 ? `${mins}:${String(secs).padStart(2, "0")}` : `${secs}s`;
666
+
667
+ // Animated waveform
668
+ const waveLen = 12;
669
+ let wave = "";
670
+ for (let i = 0; i < waveLen; i++) {
671
+ wave += waveChars[(frame + i) % waveChars.length];
672
+ }
673
+
674
+ const topBorder = theme.fg("borderAccent", "╭" + "─".repeat(maxW) + "╮");
675
+ const botBorder = theme.fg("borderAccent", "╰" + "─".repeat(maxW) + "╯");
676
+ const pad = (s: string, w: number) => {
677
+ const visible = s.replace(/\x1b\[[^m]*m/g, "").length;
678
+ return s + " ".repeat(Math.max(0, w - visible));
679
+ };
680
+
681
+ const dot = theme.fg("error", "●");
682
+ const label = target === "btw"
683
+ ? theme.bold(theme.fg("accent", " BTW "))
684
+ : theme.bold(theme.fg("accent", " VOICE "));
685
+ const waveStyled = theme.fg("accent", wave);
686
+ const timeStyled = theme.fg("muted", timeStr);
687
+
688
+ const titleLine = ` ${dot} ${label} ${waveStyled} ${timeStyled}`;
689
+
690
+ const hint = target === "btw"
691
+ ? theme.fg("dim", " Press Ctrl+Shift+B to stop")
692
+ : kittyReleaseDetected
693
+ ? theme.fg("dim", " Release SPACE to finalize")
694
+ : theme.fg("dim", " Press SPACE again to stop");
695
+
696
+ const lines = [
697
+ topBorder,
698
+ theme.fg("borderAccent", "│") + pad(titleLine, maxW) + theme.fg("borderAccent", "│"),
699
+ theme.fg("borderAccent", "│") + pad(hint, maxW) + theme.fg("borderAccent", "│"),
700
+ botBorder,
701
+ ];
702
+ return lines;
703
+ },
704
+ };
705
+ }, { placement: "aboveEditor" });
706
+ }
707
+
708
+ function stopRecordingWidgetAnimation() {
709
+ const timer = (showRecordingWidget as any)?._animTimer;
710
+ if (timer) {
711
+ clearInterval(timer);
712
+ (showRecordingWidget as any)._animTimer = null;
363
713
  }
714
+ }
715
+
716
+ /** Show live transcript inside a themed box */
717
+ function updateLiveTranscriptWidget(interim: string, finals: string[]) {
718
+ if (!ctx?.hasUI) return;
719
+
720
+ const finalized = finals.join(" ");
721
+ const displayText = finalized + (interim ? (finalized ? " " : "") + interim : "");
722
+
723
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
724
+ return {
725
+ invalidate() {},
726
+ render(width: number): string[] {
727
+ const maxW = Math.min(width - 2, 72);
728
+ const elapsed = Math.round((Date.now() - recordingStart) / 1000);
729
+ const mins = Math.floor(elapsed / 60);
730
+ const secs = elapsed % 60;
731
+ const timeStr = mins > 0 ? `${mins}:${String(secs).padStart(2, "0")}` : `${secs}s`;
732
+
733
+ const topBorder = theme.fg("borderAccent", "╭" + "─".repeat(maxW) + "╮");
734
+ const botBorder = theme.fg("borderAccent", "╰" + "─".repeat(maxW) + "╯");
735
+ const sep = theme.fg("borderAccent", "│") + theme.fg("borderAccent", "─".repeat(maxW)) + theme.fg("borderAccent", "│");
736
+ const side = (content: string) => {
737
+ const stripped = content.replace(/\x1b\[[^m]*m/g, "");
738
+ const padding = Math.max(0, maxW - stripped.length);
739
+ return theme.fg("borderAccent", "│") + content + " ".repeat(padding) + theme.fg("borderAccent", "│");
740
+ };
741
+
742
+ const dot = theme.fg("error", "●");
743
+ const label = theme.bold(theme.fg("accent", " VOICE "));
744
+ const timeStyled = theme.fg("muted", timeStr);
745
+ const titleLine = ` ${dot} ${label} ${timeStyled}`;
746
+ const hint = kittyReleaseDetected
747
+ ? theme.fg("dim", " Release SPACE to finalize")
748
+ : theme.fg("dim", " Press SPACE again to stop");
749
+
750
+ const lines = [topBorder, side(titleLine)];
751
+
752
+ if (!displayText.trim()) {
753
+ lines.push(side(theme.fg("dim", " Listening... speak now")));
754
+ } else {
755
+ lines.push(sep);
756
+ // Word-wrap the transcript text
757
+ const innerMax = maxW - 4; // padding inside box
758
+ const words = displayText.split(" ");
759
+ const wrappedLines: string[] = [];
760
+ let currentLine = "";
761
+
762
+ for (const word of words) {
763
+ if ((currentLine + " " + word).trim().length > innerMax && currentLine) {
764
+ wrappedLines.push(currentLine);
765
+ currentLine = word;
766
+ } else {
767
+ currentLine = currentLine ? currentLine + " " + word : word;
768
+ }
769
+ }
770
+ if (currentLine) wrappedLines.push(currentLine);
771
+
772
+ // Show last 3 lines of transcript
773
+ const visible = wrappedLines.slice(-3);
774
+ for (let i = 0; i < visible.length; i++) {
775
+ let line = visible[i];
776
+ // Style: all transcript text in normal color; an accent cursor is appended to the last line while interim text is pending
777
+ if (i === visible.length - 1 && interim) {
778
+ line = theme.fg("text", line) + theme.fg("accent", "▍");
779
+ } else {
780
+ line = theme.fg("text", line);
781
+ }
782
+ lines.push(side(" " + line));
783
+ }
784
+ }
785
+
786
+ lines.push(side(hint));
787
+ lines.push(botBorder);
788
+ return lines;
789
+ },
790
+ };
791
+ }, { placement: "aboveEditor" });
792
+ }
364
793
 
794
+ /** Transcribing state — show a processing indicator */
795
+ function showTranscribingWidget() {
796
+ if (!ctx?.hasUI) return;
797
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
798
+ return {
799
+ invalidate() {},
800
+ render(width: number): string[] {
801
+ const maxW = Math.min(width - 2, 72);
802
+ const topBorder = theme.fg("border", "╭" + "─".repeat(maxW) + "╮");
803
+ const botBorder = theme.fg("border", "╰" + "─".repeat(maxW) + "╯");
804
+ const side = (content: string) => {
805
+ const stripped = content.replace(/\x1b\[[^m]*m/g, "");
806
+ const padding = Math.max(0, maxW - stripped.length);
807
+ return theme.fg("border", "│") + content + " ".repeat(padding) + theme.fg("border", "│");
808
+ };
809
+ const spinner = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
810
+ const idx = Math.floor(Date.now() / 100) % spinner.length;
811
+ const line = ` ${theme.fg("accent", spinner[idx])} ${theme.fg("dim", "Finalizing transcription...")}`;
812
+ return [topBorder, side(line), botBorder];
813
+ },
814
+ };
815
+ }, { placement: "aboveEditor" });
816
+ }
817
+
818
+ // ─── Voice: Start / Stop (Streaming or Legacy) ───────────────────────────
819
+
820
+ async function startVoiceRecording(target: "editor" | "btw" = "editor"): Promise<boolean> {
821
+ if (voiceState !== "idle" || !ctx) return false;
822
+
823
+ currentTarget = target;
365
824
  recordingStart = Date.now();
366
- setVoiceState("recording");
367
- statusTimer = setInterval(() => {
368
- if (voiceState === "recording") {
369
- updateVoiceStatus();
370
- // Safety: auto-stop after MAX_RECORDING_SECS
371
- const elapsed = (Date.now() - recordingStart) / 1000;
372
- if (elapsed >= MAX_RECORDING_SECS) {
373
- isHolding = false;
374
- stopVoiceRecording(target);
825
+
826
+ if (isDeepgramStreaming(config)) {
827
+ // === STREAMING PATH === (Deepgram WebSocket)
828
+ setVoiceState("recording");
829
+
830
+ const session = startStreamingSession(config, {
831
+ onTranscript: (interim, finals) => {
832
+ updateLiveTranscriptWidget(interim, finals);
833
+ updateVoiceStatus();
834
+ },
835
+ onDone: (fullText) => {
836
+ activeSession = null;
837
+ stopRecordingWidgetAnimation();
838
+ ctx?.ui.setWidget("voice-recording", undefined);
839
+
840
+ if (!fullText.trim()) {
841
+ ctx?.ui.notify("No speech detected.", "warning");
842
+ setVoiceState("idle");
843
+ return;
844
+ }
845
+
846
+ if (target === "btw") {
847
+ handleBtw(fullText);
848
+ } else {
849
+ if (ctx?.hasUI) {
850
+ const existing = ctx.ui.getEditorText();
851
+ ctx.ui.setEditorText(existing ? existing + " " + fullText : fullText);
852
+ const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
853
+ ctx.ui.notify(
854
+ `STT (${elapsed}s): ${fullText.slice(0, 80)}${fullText.length > 80 ? "..." : ""}`,
855
+ "info",
856
+ );
857
+ }
858
+ }
859
+ setVoiceState("idle");
860
+ },
861
+ onError: (err) => {
862
+ activeSession = null;
863
+ stopRecordingWidgetAnimation();
864
+ ctx?.ui.setWidget("voice-recording", undefined);
865
+ ctx?.ui.notify(`STT error: ${err}`, "error");
866
+ setVoiceState("idle");
867
+ },
868
+ });
869
+
870
+ if (!session) {
871
+ setVoiceState("idle");
872
+ return false;
873
+ }
874
+
875
+ activeSession = session;
876
+
877
+ // Status timer for elapsed time
878
+ statusTimer = setInterval(() => {
879
+ if (voiceState === "recording") {
880
+ updateVoiceStatus();
881
+ const elapsed = (Date.now() - recordingStart) / 1000;
882
+ if (elapsed >= MAX_RECORDING_SECS) {
883
+ isHolding = false;
884
+ stopVoiceRecording(target);
885
+ }
886
+ }
887
+ }, 1000);
888
+
889
+ // Show the themed recording widget
890
+ showRecordingWidget(target);
891
+ return true;
892
+
893
+ } else {
894
+ // === LEGACY PATH === (file-based for local backends)
895
+ tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
896
+ if (!startLegacyRecordingToFile(tempFile)) {
897
+ ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
898
+ return false;
899
+ }
900
+
901
+ setVoiceState("recording");
902
+ statusTimer = setInterval(() => {
903
+ if (voiceState === "recording") {
904
+ updateVoiceStatus();
905
+ const elapsed = (Date.now() - recordingStart) / 1000;
906
+ if (elapsed >= MAX_RECORDING_SECS) {
907
+ isHolding = false;
908
+ stopVoiceRecording(target);
909
+ }
375
910
  }
911
+ }, 1000);
912
+
913
+ if (ctx.hasUI) {
914
+ // Show themed recording widget for legacy path
915
+ showRecordingWidget(target);
376
916
  }
377
- }, 1000);
378
-
379
- if (ctx.hasUI) {
380
- ctx.ui.setWidget("voice-recording", [
381
- target === "btw"
382
- ? " 🎙 BTW Recording... Ctrl+Shift+V to stop"
383
- : " 🎙 Recording... Ctrl+Shift+V to stop (or release SPACE)",
384
- ], { placement: "aboveEditor" });
917
+ return true;
385
918
  }
386
- return true;
387
919
  }
388
920
 
389
921
  async function stopVoiceRecording(target: "editor" | "btw" = "editor") {
390
922
  if (voiceState !== "recording" || !ctx) return;
391
923
  if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
392
924
 
925
+ if (activeSession) {
926
+ // === STREAMING PATH === Stop the stream, finalize will call onDone
927
+ setVoiceState("transcribing");
928
+ stopRecordingWidgetAnimation();
929
+ showTranscribingWidget();
930
+ stopStreamingSession(activeSession);
931
+ return;
932
+ }
933
+
934
+ // === LEGACY PATH ===
393
935
  const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
394
- const audioFile = tempFile; // capture before cleanup can null it
936
+ const audioFile = tempFile;
395
937
  setVoiceState("transcribing");
396
- ctx.ui.setWidget("voice-recording", undefined);
938
+ stopRecordingWidgetAnimation();
939
+ showTranscribingWidget();
397
940
 
398
- await stopRecording();
941
+ await stopLegacyRecording();
399
942
 
400
943
  if (!audioFile || !fs.existsSync(audioFile)) {
401
944
  ctx.ui.notify("No audio recorded.", "warning");
@@ -412,12 +955,9 @@ export default function (pi: ExtensionAPI) {
412
955
  return;
413
956
  }
414
957
 
415
- // Ensure daemon is up before transcribing — await so the warm path
416
- // is available for this request instead of falling through to the
417
- // cold subprocess fallback.
418
958
  await ensureDaemon(config).catch(() => {});
419
959
 
420
- const result = await transcribeAudio(audioFile, config);
960
+ const result = await transcribeAudioFile(audioFile, config);
421
961
  try { fs.unlinkSync(audioFile); } catch {}
422
962
  if (tempFile === audioFile) tempFile = null;
423
963
 
@@ -437,7 +977,6 @@ export default function (pi: ExtensionAPI) {
437
977
  if (target === "btw") {
438
978
  await handleBtw(transcript);
439
979
  } else {
440
- // Inject into editor
441
980
  if (ctx.hasUI) {
442
981
  const existing = ctx.ui.getEditorText();
443
982
  ctx.ui.setEditorText(existing ? existing + " " + transcript : transcript);
@@ -451,54 +990,163 @@ export default function (pi: ExtensionAPI) {
451
990
  setVoiceState("idle");
452
991
  }
453
992
 
454
- // ─── Hold-to-talk via Kitty protocol ─────────────────────────────────────
993
+ // ─── Hold-to-talk with Duration Threshold ──────────────────────────────
994
+ //
995
+ // SPACE activates voice ONLY when:
996
+ // 1. The editor is empty (no text typed yet)
997
+ // 2. SPACE is held for ≥ HOLD_THRESHOLD_MS (500ms)
998
+ //
999
+ // If a SPACE key-release event arrives before the threshold, a regular space
1000
+ // is inserted into the editor instead (needs a terminal that reports releases).
1001
+ //
1002
+ // This prevents accidental voice activation when typing and matches
1003
+ // Claude Code's hold-to-talk UX pattern.
1004
+ //
1005
+ // For Kitty protocol terminals: hold → wait threshold → activate →
1006
+ // release → stop recording. True hold-to-talk.
1007
+ // For non-Kitty terminals: hold → wait threshold → activate →
1008
+ // press SPACE again → stop recording. Toggle after activation.
1009
+
1010
+ const HOLD_THRESHOLD_MS = 500; // minimum hold time before voice activates
1011
+ let kittyReleaseDetected = false; // flipped to true when a SPACE or Ctrl+Shift+B key-release event is seen
1012
+ let spaceDownTime: number | null = null; // timestamp when SPACE was first pressed
1013
+ let holdActivationTimer: ReturnType<typeof setTimeout> | null = null; // pending threshold timeout; null when no hold is being timed
1014
+ let spaceConsumed = false; // whether we've committed to voice (past threshold)
1015
+
1016
+ function clearHoldTimer() { // cancel the pending hold-activation timeout, if one is scheduled
1017
+ if (holdActivationTimer) {
1018
+ clearTimeout(holdActivationTimer);
1019
+ holdActivationTimer = null; // reset so a later call is a no-op
1020
+ }
1021
+ }
455
1022
 
456
1023
  function setupHoldToTalk() {
457
1024
  if (!ctx?.hasUI) return;
458
1025
 
459
- // Remove previous listener
460
1026
  if (terminalInputUnsub) { terminalInputUnsub(); terminalInputUnsub = null; }
461
1027
 
462
1028
  terminalInputUnsub = ctx.ui.onTerminalInput((data: string) => {
463
1029
  if (!config.enabled) return undefined;
464
1030
 
465
- // Hold SPACE talk → release → transcribe to editor
1031
+ // ── SPACE handling ──
466
1032
  if (matchesKey(data, "space")) {
467
- // Only activate when editor is empty (avoid conflicting with typing)
1033
+ // RULE: If editor has content, SPACE always types a space — never voice
468
1034
  const editorText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
469
- if (editorText && editorText.trim().length > 0) return undefined;
1035
+ if (editorText && editorText.trim().length > 0) {
1036
+ clearHoldTimer();
1037
+ spaceDownTime = null;
1038
+ spaceConsumed = false;
1039
+ return undefined; // let the default space character through
1040
+ }
470
1041
 
1042
+ // ── Kitty key-release ──
471
1043
  if (isKeyRelease(data)) {
472
- if (isHolding) {
1044
+ kittyReleaseDetected = true;
1045
+
1046
+ // Released before threshold → type a space character
1047
+ if (spaceDownTime && !spaceConsumed) {
1048
+ clearHoldTimer();
1049
+ spaceDownTime = null;
1050
+ spaceConsumed = false;
1051
+ // Insert a space into editor
1052
+ if (ctx?.hasUI) ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
1053
+ return { consume: true };
1054
+ }
1055
+
1056
+ // Released after threshold → stop recording (true hold-to-talk)
1057
+ if (spaceConsumed && isHolding && voiceState === "recording") {
473
1058
  isHolding = false;
1059
+ spaceConsumed = false;
1060
+ spaceDownTime = null;
474
1061
  stopVoiceRecording("editor");
475
1062
  return { consume: true };
476
1063
  }
1064
+
1065
+ spaceDownTime = null;
1066
+ spaceConsumed = false;
477
1067
  return undefined;
478
1068
  }
479
1069
 
1070
+ // ── Kitty key-repeat: suppress while holding past threshold ──
480
1071
  if (isKeyRepeat(data)) {
481
- if (isHolding) return { consume: true };
1072
+ if (spaceConsumed || isHolding) return { consume: true };
482
1073
  return undefined;
483
1074
  }
484
1075
 
485
- // Key press — start recording
486
- if (voiceState === "idle" && !isHolding) {
487
- isHolding = true;
488
- startVoiceRecording("editor").then((ok) => {
489
- if (!ok) isHolding = false;
490
- });
1076
+ // === Key PRESS ===
1077
+
1078
+ // If already recording (toggle mode for non-Kitty) → stop
1079
+ if (voiceState === "recording" && spaceConsumed) {
1080
+ isHolding = false;
1081
+ spaceConsumed = false;
1082
+ spaceDownTime = null;
1083
+ clearHoldTimer();
1084
+ stopVoiceRecording("editor");
491
1085
  return { consume: true };
492
1086
  }
493
1087
 
494
- if (isHolding) return { consume: true };
1088
+ // If transcribing ignore
1089
+ if (voiceState === "transcribing") {
1090
+ return { consume: true };
1091
+ }
1092
+
1093
+ // Idle → start the hold timer
1094
+ if (voiceState === "idle" && !spaceDownTime) {
1095
+ spaceDownTime = Date.now();
1096
+ spaceConsumed = false;
1097
+
1098
+ // Show a subtle "preparing" indicator
1099
+ if (ctx?.hasUI) {
1100
+ showHoldHintWidget();
1101
+ }
1102
+
1103
+ // After threshold: activate voice recording
1104
+ holdActivationTimer = setTimeout(() => {
1105
+ holdActivationTimer = null;
1106
+ // Double-check: still idle, still holding, editor still empty
1107
+ const currentText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
1108
+ if (voiceState === "idle" && spaceDownTime && !(currentText && currentText.trim().length > 0)) {
1109
+ spaceConsumed = true;
1110
+ isHolding = true;
1111
+ startVoiceRecording("editor").then((ok) => {
1112
+ if (!ok) {
1113
+ isHolding = false;
1114
+ spaceConsumed = false;
1115
+ spaceDownTime = null;
1116
+ }
1117
+ });
1118
+ } else {
1119
+ spaceDownTime = null;
1120
+ spaceConsumed = false;
1121
+ }
1122
+ }, HOLD_THRESHOLD_MS);
1123
+
1124
+ return { consume: true }; // consume now — we'll insert space on early release
1125
+ }
1126
+
1127
+ if (isHolding || spaceConsumed) return { consume: true };
1128
+ return undefined;
1129
+ }
1130
+
1131
+ // ── Any other key while holding space (pre-threshold) → cancel hold, insert space ──
1132
+ if (spaceDownTime && !spaceConsumed && !matchesKey(data, "space")) {
1133
+ clearHoldTimer();
1134
+ // Insert the space that was consumed during hold detection
1135
+ if (ctx?.hasUI) {
1136
+ ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
1137
+ hideHoldHintWidget();
1138
+ }
1139
+ spaceDownTime = null;
1140
+ spaceConsumed = false;
1141
+ // Don't consume this key — let it through
495
1142
  return undefined;
496
1143
  }
497
1144
 
498
- // Hold Ctrl+Shift+B talk release auto-btw
1145
+ // ── Ctrl+Shift+B handling (BTW voice) direct toggle, no hold threshold ──
499
1146
  if (matchesKey(data, "ctrl+shift+b")) {
500
1147
  if (isKeyRelease(data)) {
501
- if (isHolding) {
1148
+ kittyReleaseDetected = true;
1149
+ if (isHolding && voiceState === "recording") {
502
1150
  isHolding = false;
503
1151
  stopVoiceRecording("btw");
504
1152
  return { consume: true };
@@ -511,6 +1159,13 @@ export default function (pi: ExtensionAPI) {
511
1159
  return undefined;
512
1160
  }
513
1161
 
1162
+ // Toggle: stop if recording
1163
+ if (voiceState === "recording") {
1164
+ isHolding = false;
1165
+ stopVoiceRecording("btw");
1166
+ return { consume: true };
1167
+ }
1168
+
514
1169
  if (voiceState === "idle" && !isHolding) {
515
1170
  isHolding = true;
516
1171
  startVoiceRecording("btw").then((ok) => {
@@ -523,12 +1178,6 @@ export default function (pi: ExtensionAPI) {
523
1178
  return undefined;
524
1179
  }
525
1180
 
526
- // Any other key while holding = cancel
527
- if (isHolding && voiceState === "recording") {
528
- // Don't cancel on modifier-only events
529
- return undefined;
530
- }
531
-
532
1181
  return undefined;
533
1182
  });
534
1183
  }
@@ -536,7 +1185,6 @@ export default function (pi: ExtensionAPI) {
536
1185
  // ─── BTW: Side Conversations ─────────────────────────────────────────────
537
1186
 
538
1187
  function buildBtwContext(): string {
539
- // Build context from main session + btw thread
540
1188
  const systemPrompt = ctx?.getSystemPrompt() ?? "";
541
1189
  let btwContext = "You are a helpful side-channel assistant. ";
542
1190
  btwContext += "The user is having a parallel conversation while their main Pi agent works. ";
@@ -570,7 +1218,6 @@ export default function (pi: ExtensionAPI) {
570
1218
  "",
571
1219
  ];
572
1220
 
573
- // Show last exchange
574
1221
  lines.push(` Q: ${last.question.slice(0, 100)}${last.question.length > 100 ? "..." : ""}`);
575
1222
  const answerLines = last.answer.split("\n");
576
1223
  for (const line of answerLines.slice(0, 8)) {
@@ -589,7 +1236,6 @@ export default function (pi: ExtensionAPI) {
589
1236
 
590
1237
  btwWidgetVisible = true;
591
1238
 
592
- // Show thinking state
593
1239
  ctx.ui.setWidget("btw", [
594
1240
  " BTW",
595
1241
  "",
@@ -598,10 +1244,8 @@ export default function (pi: ExtensionAPI) {
598
1244
  " Thinking...",
599
1245
  ], { placement: "aboveEditor" });
600
1246
 
601
- // Build context for LLM
602
1247
  const btwContext = buildBtwContext();
603
1248
 
604
- // Use the model registry to get current model
605
1249
  const model = ctx.model;
606
1250
  if (!model) {
607
1251
  const exchange: BtwExchange = {
@@ -616,7 +1260,6 @@ export default function (pi: ExtensionAPI) {
616
1260
  }
617
1261
 
618
1262
  try {
619
- // Stream the response
620
1263
  let answer = "";
621
1264
  const eventStream = streamSimple(model, {
622
1265
  systemPrompt: btwContext,
@@ -633,7 +1276,6 @@ export default function (pi: ExtensionAPI) {
633
1276
  break;
634
1277
  }
635
1278
 
636
- // Update widget with streaming response
637
1279
  const displayLines: string[] = [
638
1280
  ` BTW`,
639
1281
  "",
@@ -657,7 +1299,6 @@ export default function (pi: ExtensionAPI) {
657
1299
  pi.appendEntry("btw", exchange);
658
1300
  updateBtwWidget();
659
1301
  } catch (err: any) {
660
- // Fallback: send as a follow-up message to the main agent
661
1302
  const exchange: BtwExchange = {
662
1303
  question: message,
663
1304
  answer: `(BTW streaming failed: ${err.message}. Falling back to sendUserMessage.)`,
@@ -667,7 +1308,6 @@ export default function (pi: ExtensionAPI) {
667
1308
  pi.appendEntry("btw", exchange);
668
1309
  updateBtwWidget();
669
1310
 
670
- // Use sendUserMessage as alternative
671
1311
  pi.sendUserMessage(
672
1312
  `[BTW question]: ${message}`,
673
1313
  { deliverAs: "followUp" },
@@ -677,7 +1317,6 @@ export default function (pi: ExtensionAPI) {
677
1317
 
678
1318
  // ─── Shortcuts ───────────────────────────────────────────────────────────
679
1319
 
680
- // Ctrl+Shift+V = toggle voice (fallback for non-Kitty terminals)
681
1320
  pi.registerShortcut("ctrl+shift+v", {
682
1321
  description: "Toggle voice recording (start/stop)",
683
1322
  handler: async (handlerCtx) => {
@@ -705,12 +1344,42 @@ export default function (pi: ExtensionAPI) {
705
1344
  configSource = loaded.source;
706
1345
  updateSocketPath(config, currentCwd);
707
1346
 
708
- // No auto-popup on startup. Users run `/voice setup` to configure.
709
- // Only activate voice features if setup has been completed previously.
1347
+ // Auto-capture DEEPGRAM_API_KEY from env into config if not already stored.
1348
+ // This ensures streaming works even when Pi is launched from a context
1349
+ // that doesn't source .zshrc (GUI app, tmux, etc.)
1350
+ if (process.env.DEEPGRAM_API_KEY && !config.deepgramApiKey) {
1351
+ config.deepgramApiKey = process.env.DEEPGRAM_API_KEY;
1352
+ if (configSource !== "default") {
1353
+ saveConfig(config, config.scope, currentCwd);
1354
+ }
1355
+ }
1356
+
1357
+ // Also try to load DEEPGRAM_API_KEY from shell if not in process.env and not in config
1358
+ if (!resolveDeepgramApiKey(config) && config.backend === "deepgram") {
1359
+ try {
1360
+ const result = spawnSync("zsh", ["-ic", "echo $DEEPGRAM_API_KEY"], {
1361
+ stdio: ["pipe", "pipe", "pipe"],
1362
+ timeout: 3000,
1363
+ env: { ...process.env, HOME: os.homedir() },
1364
+ });
1365
+ const shellKey = result.stdout?.toString().trim();
1366
+ if (shellKey && shellKey.length > 5) {
1367
+ config.deepgramApiKey = shellKey;
1368
+ process.env.DEEPGRAM_API_KEY = shellKey; // Also set for child processes
1369
+ if (configSource !== "default") {
1370
+ saveConfig(config, config.scope, currentCwd);
1371
+ }
1372
+ }
1373
+ } catch {}
1374
+ }
1375
+
710
1376
  if (config.enabled && config.onboarding.completed) {
711
1377
  updateVoiceStatus();
712
1378
  setupHoldToTalk();
713
- ensureDaemon(config).catch(() => {});
1379
+ // Only start daemon for non-streaming backends
1380
+ if (!isDeepgramStreaming(config)) {
1381
+ ensureDaemon(config).catch(() => {});
1382
+ }
714
1383
  }
715
1384
  });
716
1385
 
@@ -764,8 +1433,11 @@ export default function (pi: ExtensionAPI) {
764
1433
  config.enabled = true;
765
1434
  updateVoiceStatus();
766
1435
  setupHoldToTalk();
767
- ensureDaemon(config).catch(() => {});
768
- cmdCtx.ui.notify("Voice enabled.\n Hold SPACE (empty editor) release to transcribe\n Ctrl+Shift+V → toggle recording on/off\n Auto-stops after 30s", "info");
1436
+ if (!isDeepgramStreaming(config)) {
1437
+ ensureDaemon(config).catch(() => {});
1438
+ }
1439
+ const mode = isDeepgramStreaming(config) ? "Deepgram streaming" : config.backend;
1440
+ cmdCtx.ui.notify(`Voice enabled (${mode}).\n Hold SPACE (empty editor) → release to transcribe\n Ctrl+Shift+V → toggle recording on/off\n Live transcription shown while speaking`, "info");
769
1441
  return;
770
1442
  }
771
1443
 
@@ -779,7 +1451,6 @@ export default function (pi: ExtensionAPI) {
779
1451
  }
780
1452
 
781
1453
  if (sub === "stop") {
782
- // Emergency stop — cancel any active recording
783
1454
  if (voiceState === "recording") {
784
1455
  isHolding = false;
785
1456
  await stopVoiceRecording("editor");
@@ -793,6 +1464,8 @@ export default function (pi: ExtensionAPI) {
793
1464
  if (sub === "test") {
794
1465
  cmdCtx.ui.notify("Testing voice setup...", "info");
795
1466
  const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
1467
+ const dgKey = resolveDeepgramApiKey(config);
1468
+ const streaming = isDeepgramStreaming(config);
796
1469
  const daemonUp = await isDaemonRunning();
797
1470
  const provisioningPlan = buildProvisioningPlan(config, diagnostics);
798
1471
  const selectedBackend = diagnostics.backends.find((backend) => backend.name === config.backend);
@@ -805,6 +1478,8 @@ export default function (pi: ExtensionAPI) {
805
1478
  ` model: ${config.model}`,
806
1479
  ` model status: ${modelReadiness}`,
807
1480
  ` language: ${config.language}`,
1481
+ ` streaming: ${streaming ? "YES (Deepgram WS)" : "NO (batch)"}`,
1482
+ ` DEEPGRAM_API_KEY: ${dgKey ? "set (" + dgKey.slice(0, 8) + "...)" : "NOT SET"}`,
808
1483
  ` onboarding: ${config.onboarding.completed ? "complete" : "incomplete"}`,
809
1484
  ` python3: ${diagnostics.hasPython ? "OK" : "missing"}`,
810
1485
  ` sox/rec: ${diagnostics.hasSox ? "OK" : "missing"}`,
@@ -826,11 +1501,10 @@ export default function (pi: ExtensionAPI) {
826
1501
  }
827
1502
  }
828
1503
 
829
- lines.push("", "Suggested commands:");
830
- lines.push(...(provisioningPlan.commands.length > 0 ? provisioningPlan.commands.map((command) => ` - ${command}`) : [" - none"]));
831
- if (provisioningPlan.manualSteps.length > 0) {
832
- lines.push("", "Manual steps:");
833
- lines.push(...provisioningPlan.manualSteps.map((step) => ` - ${step}`));
1504
+ if (!dgKey && config.backend === "deepgram") {
1505
+ lines.push("");
1506
+ lines.push("⚠️ DEEPGRAM_API_KEY not set! Add to ~/.zshrc or ~/.env.secrets");
1507
+ lines.push(" export DEEPGRAM_API_KEY=your_key_here");
834
1508
  }
835
1509
 
836
1510
  cmdCtx.ui.notify(lines.join("\n"), provisioningPlan.ready ? "info" : "warning");
@@ -847,22 +1521,24 @@ export default function (pi: ExtensionAPI) {
847
1521
  const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
848
1522
  const selectedBackend = diagnostics.backends.find((backend) => backend.name === config.backend);
849
1523
  const modelReadiness = getModelReadiness(selectedBackend, config.model);
1524
+ const streaming = isDeepgramStreaming(config);
850
1525
 
851
1526
  cmdCtx.ui.notify([
852
1527
  `Voice config:`,
853
- ` enabled: ${config.enabled}`,
854
- ` mode: ${config.mode}`,
855
- ` scope: ${config.scope}`,
856
- ` backend: ${config.backend}`,
857
- ` model: ${config.model}`,
858
- ` model status: ${modelReadiness}`,
859
- ` language: ${config.language}`,
860
- ` state: ${voiceState}`,
861
- ` setup: ${config.onboarding.completed ? `complete (${config.onboarding.source ?? "unknown"})` : "incomplete"}`,
862
- ` socket: ${activeSocketPath}`,
863
- ` daemon: ${daemonUp ? "running" : "stopped"}${daemonInfo}`,
864
- ` hold-key: SPACE (editor empty) or Ctrl+Shift+V (toggle)`,
865
- ` btw-key: Ctrl+Shift+B (hold to record → auto-btw)`,
1528
+ ` enabled: ${config.enabled}`,
1529
+ ` mode: ${config.mode}`,
1530
+ ` scope: ${config.scope}`,
1531
+ ` backend: ${config.backend}`,
1532
+ ` model: ${config.model}`,
1533
+ ` model stat: ${modelReadiness}`,
1534
+ ` language: ${config.language}`,
1535
+ ` streaming: ${streaming ? "YES (Deepgram WebSocket)" : "NO (batch)"}`,
1536
+ ` state: ${voiceState}`,
1537
+ ` setup: ${config.onboarding.completed ? `complete (${config.onboarding.source ?? "unknown"})` : "incomplete"}`,
1538
+ ` socket: ${activeSocketPath}`,
1539
+ ` daemon: ${daemonUp ? "running" : "stopped"}${daemonInfo}`,
1540
+ ` hold-key: SPACE (editor empty) or Ctrl+Shift+V (toggle)`,
1541
+ ` btw-key: Ctrl+Shift+B (hold to record → auto-btw)`,
866
1542
  ].join("\n"), "info");
867
1543
  return;
868
1544
  }
@@ -905,7 +1581,6 @@ export default function (pi: ExtensionAPI) {
905
1581
  cmdCtx.ui.notify("Voice setup cancelled.", "warning");
906
1582
  return;
907
1583
  }
908
-
909
1584
  await finalizeAndSaveSetup(cmdCtx, result.config, result.selectedScope, result.summaryLines, "setup-command");
910
1585
  return;
911
1586
  }
@@ -1013,7 +1688,7 @@ export default function (pi: ExtensionAPI) {
1013
1688
  },
1014
1689
  });
1015
1690
 
1016
- // ─── Dedicated setup command (discoverable in /command list) ──────────────
1691
+ // ─── Dedicated setup command ─────────────────────────────────────────────
1017
1692
 
1018
1693
  pi.registerCommand("voice-setup", {
1019
1694
  description: "Configure voice input — select backend, model, and language",
@@ -1081,7 +1756,6 @@ export default function (pi: ExtensionAPI) {
1081
1756
 
1082
1757
  pi.sendUserMessage(content, { deliverAs: "followUp" });
1083
1758
 
1084
- // Clear after injection
1085
1759
  btwThread = [];
1086
1760
  btwWidgetVisible = false;
1087
1761
  cmdCtx.ui.setWidget("btw", undefined);
@@ -1106,7 +1780,6 @@ export default function (pi: ExtensionAPI) {
1106
1780
  threadText += `Q: ${ex.question}\nA: ${ex.answer}\n\n`;
1107
1781
  }
1108
1782
 
1109
- // Ask the model to summarize
1110
1783
  const model = ctx.model;
1111
1784
  if (!model) {
1112
1785
  cmdCtx.ui.notify("No model available for summarization.", "error");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@codexstar/pi-listen",
3
- "version": "1.0.12",
3
+ "version": "1.0.14",
4
4
  "description": "Voice input, first-run onboarding, and side-channel BTW conversations for Pi",
5
5
  "type": "module",
6
6
  "keywords": [