@codexstar/pi-listen 1.0.16 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/extensions/voice.ts +415 -342
  2. package/package.json +1 -1
@@ -1,29 +1,48 @@
1
1
  /**
2
- * pi-voice — Deepgram WebSocket streaming STT for Pi CLI.
2
+ * pi-voice — Enterprise-grade voice STT for Pi CLI.
3
3
  *
4
4
  * Architecture (modeled after Claude Code's voice pipeline):
5
- * 1. SoX `rec` captures mic audio as raw PCM (16kHz, mono, 16-bit)
6
- * and pipes it to stdout (no file).
7
- * 2. Raw PCM chunks are streamed over a WebSocket to Deepgram Nova 3.
8
- * 3. Deepgram returns interim + final transcripts in real-time.
9
- * 4. Interim transcripts update a live widget above the editor.
10
- * 5. On key-release (or toggle stop), a CloseStream message is sent;
11
- * final transcript is injected into the editor.
5
+ *
6
+ * STATE MACHINE
7
+ * ─────────────
8
+ * idle → warmup → recording → finalizing → idle
9
+ * ↑ │
10
+ * └─────────┘ (rapid re-press recovery)
11
+ *
12
+ * warmup: User holds SPACE for ≥ HOLD_THRESHOLD_MS (500ms).
13
+ * A "keep holding…" hint is shown. If released before
14
+ * the threshold, a normal space character is typed.
15
+ *
16
+ * recording: SoX captures PCM → Deepgram WebSocket streaming.
17
+ * Live interim + final transcripts update the widget.
18
+ * Release SPACE (or press again in toggle mode) → stop.
19
+ *
20
+ * finalizing: CloseStream sent to Deepgram. Waiting for final
21
+ * transcript. Safety timeout auto-completes.
22
+ *
23
+ * HOLD-TO-TALK DETECTION (non-Kitty terminals)
24
+ * ─────────────────────────────────────────────
25
+ * Holding a key sends rapid key-press events (~30ms apart).
26
+ * "Release" is detected when the gap between presses exceeds
27
+ * RELEASE_DETECT_MS (150ms).
28
+ *
29
+ * ENTERPRISE FALLBACKS
30
+ * ────────────────────
31
+ * • Session corruption guard: new recording request during
32
+ * finalizing automatically cancels the stale session first.
33
+ * • Transient failure retry: on WebSocket error during rapid
34
+ * push-to-talk re-press, auto-retry once after 300ms.
35
+ * • Stale transcript cleanup: any prior transcript is cleared
36
+ * before new recording begins.
37
+ * • Silence vs. no-speech: distinguishes "mic captured silence"
38
+ * from "no speech detected" with distinct user messages.
12
39
  *
13
40
  * Activation:
14
- * - Hold SPACE (empty editor) → release to finalize
15
- * - Ctrl+Shift+V → toggle start/stop (fallback for non-Kitty terminals)
41
+ * - Hold SPACE (≥500ms) → release to finalize
42
+ * - Ctrl+Shift+V → toggle start/stop (always works)
16
43
  * - Ctrl+Shift+B → hold to record → auto-send as /btw
17
44
  *
18
- * Config in ~/.pi/agent/settings.json:
19
- * {
20
- * "voice": {
21
- * "enabled": true,
22
- * "language": "en",
23
- * "backend": "deepgram",
24
- * "model": "nova-3"
25
- * }
26
- * }
45
+ * Config in ~/.pi/agent/settings.json under "voice": { ... }
27
46
  */
28
47
 
29
48
  import type {
@@ -57,7 +76,14 @@ import { buildProvisioningPlan } from "./voice/install";
57
76
 
58
77
  // ─── Types ───────────────────────────────────────────────────────────────────
59
78
 
60
- type VoiceState = "idle" | "recording" | "transcribing";
79
+ /**
80
+ * Voice state machine — strict transitions only:
81
+ * idle → warmup → recording → finalizing → idle
82
+ * warmup → idle (released before threshold)
83
+ * recording → idle (on error)
84
+ * finalizing → idle (on completion or timeout)
85
+ */
86
+ type VoiceState = "idle" | "warmup" | "recording" | "finalizing";
61
87
 
62
88
  interface BtwExchange {
63
89
  question: string;
@@ -76,7 +102,14 @@ const DEEPGRAM_WS_URL = "wss://api.deepgram.com/v1/listen";
76
102
  const KEEPALIVE_INTERVAL_MS = 8000;
77
103
  const FINALIZE_SAFETY_TIMEOUT_MS = 5000;
78
104
  const FINALIZE_NO_DATA_TIMEOUT_MS = 1500;
79
- const MAX_RECORDING_SECS = 120; // 2 minutes safety cap (streaming is efficient)
105
+ const MAX_RECORDING_SECS = 120;
106
+
107
+ // Hold-to-talk timing
108
+ const HOLD_THRESHOLD_MS = 500; // Must hold for this long before activation
109
+ const RELEASE_DETECT_MS = 150; // Gap in key-repeat → "released"
110
+ const RETRY_DELAY_MS = 300; // Auto-retry on transient failure during rapid re-press
111
+ const MAX_RETRY_ATTEMPTS = 1; // Max retries per activation attempt
112
+ const CORRUPTION_GUARD_MS = 200; // Min gap between stop and restart
80
113
 
81
114
  const EXT_DIR = path.dirname(new URL(import.meta.url).pathname);
82
115
  const PROJECT_ROOT = path.join(EXT_DIR, "..");
@@ -264,25 +297,17 @@ async function transcribeAudioFile(
264
297
  interface StreamingSession {
265
298
  ws: WebSocket;
266
299
  recProcess: ChildProcess;
267
- interimText: string; // Current interim (partial) transcript
268
- finalizedParts: string[]; // All finalized transcript segments
300
+ interimText: string;
301
+ finalizedParts: string[];
269
302
  keepAliveTimer: ReturnType<typeof setInterval> | null;
270
303
  closed: boolean;
304
+ hadAudioData: boolean; // Track if we received any audio data
305
+ hadSpeech: boolean; // Track if Deepgram detected any speech
271
306
  onTranscript: (interim: string, finals: string[]) => void;
272
- onDone: (fullText: string) => void;
307
+ onDone: (fullText: string, meta: { hadAudio: boolean; hadSpeech: boolean }) => void;
273
308
  onError: (err: string) => void;
274
309
  }
275
310
 
276
- function getDeepgramApiKey(): string | null {
277
- // Priority: env var → config file → null
278
- return process.env.DEEPGRAM_API_KEY || null;
279
- }
280
-
281
- /**
282
- * Resolve the Deepgram API key from all sources:
283
- * 1. process.env.DEEPGRAM_API_KEY (shell)
284
- * 2. config.deepgramApiKey (settings.json, persisted at setup time)
285
- */
286
311
  function resolveDeepgramApiKey(config: VoiceConfig): string | null {
287
312
  return process.env.DEEPGRAM_API_KEY || config.deepgramApiKey || null;
288
313
  }
@@ -290,7 +315,6 @@ function resolveDeepgramApiKey(config: VoiceConfig): string | null {
290
315
  function isDeepgramStreaming(config: VoiceConfig): boolean {
291
316
  const key = resolveDeepgramApiKey(config);
292
317
  if (!key) return false;
293
- // Use streaming for deepgram backend, or auto mode when deepgram key is available
294
318
  return config.backend === "deepgram" || (config.backend === "auto" && !!key);
295
319
  }
296
320
 
@@ -299,8 +323,8 @@ function buildDeepgramWsUrl(config: VoiceConfig): string {
299
323
  encoding: ENCODING,
300
324
  sample_rate: String(SAMPLE_RATE),
301
325
  channels: String(CHANNELS),
302
- endpointing: "300", // ms of silence before phrase boundary
303
- utterance_end_ms: "1000", // ms of silence before utterance is complete
326
+ endpointing: "300",
327
+ utterance_end_ms: "1000",
304
328
  language: config.language || "en",
305
329
  model: config.model || "nova-3",
306
330
  smart_format: "true",
@@ -313,7 +337,7 @@ function startStreamingSession(
313
337
  config: VoiceConfig,
314
338
  callbacks: {
315
339
  onTranscript: (interim: string, finals: string[]) => void;
316
- onDone: (fullText: string) => void;
340
+ onDone: (fullText: string, meta: { hadAudio: boolean; hadSpeech: boolean }) => void;
317
341
  onError: (err: string) => void;
318
342
  },
319
343
  ): StreamingSession | null {
@@ -328,7 +352,6 @@ function startStreamingSession(
328
352
  return null;
329
353
  }
330
354
 
331
- // Start SoX streaming raw PCM to stdout (no file)
332
355
  const recProc = spawn("rec", [
333
356
  "-q",
334
357
  "-r", String(SAMPLE_RATE),
@@ -336,12 +359,11 @@ function startStreamingSession(
336
359
  "-b", "16",
337
360
  "-e", "signed-integer",
338
361
  "-t", "raw",
339
- "-", // output to stdout
362
+ "-",
340
363
  ], { stdio: ["pipe", "pipe", "pipe"] });
341
364
 
342
- recProc.stderr?.on("data", () => {}); // suppress SoX warnings
365
+ recProc.stderr?.on("data", () => {});
343
366
 
344
- // Connect WebSocket to Deepgram
345
367
  const wsUrl = buildDeepgramWsUrl(config);
346
368
  const ws = new WebSocket(wsUrl, {
347
369
  headers: {
@@ -356,25 +378,25 @@ function startStreamingSession(
356
378
  finalizedParts: [],
357
379
  keepAliveTimer: null,
358
380
  closed: false,
381
+ hadAudioData: false,
382
+ hadSpeech: false,
359
383
  onTranscript: callbacks.onTranscript,
360
384
  onDone: callbacks.onDone,
361
385
  onError: callbacks.onError,
362
386
  };
363
387
 
364
388
  ws.onopen = () => {
365
- // Send initial KeepAlive
366
389
  try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
367
390
 
368
- // Start keepalive timer
369
391
  session.keepAliveTimer = setInterval(() => {
370
392
  if (ws.readyState === WebSocket.OPEN) {
371
393
  try { ws.send(JSON.stringify({ type: "KeepAlive" })); } catch {}
372
394
  }
373
395
  }, KEEPALIVE_INTERVAL_MS);
374
396
 
375
- // Pipe SoX stdout → WebSocket as binary frames
376
397
  recProc.stdout?.on("data", (chunk: Buffer) => {
377
398
  if (ws.readyState === WebSocket.OPEN) {
399
+ session.hadAudioData = true;
378
400
  try { ws.send(chunk); } catch {}
379
401
  }
380
402
  });
@@ -389,38 +411,27 @@ function startStreamingSession(
389
411
  const alt = msg.channel?.alternatives?.[0];
390
412
  const transcript = alt?.transcript || "";
391
413
 
414
+ if (transcript.trim()) {
415
+ session.hadSpeech = true;
416
+ }
417
+
392
418
  if (msg.is_final) {
393
- // Final result for this audio segment
394
419
  if (transcript.trim()) {
395
420
  session.finalizedParts.push(transcript.trim());
396
421
  }
397
422
  session.interimText = "";
398
423
  } else {
399
- // Interim result — live update
400
424
  session.interimText = transcript;
401
425
  }
402
426
 
403
427
  session.onTranscript(session.interimText, session.finalizedParts);
404
-
405
- // If speech_final is true, it's the end of an utterance
406
- // (similar to TranscriptEndpoint in Claude Code's protocol)
407
- if (msg.speech_final && transcript.trim()) {
408
- // Already added to finalizedParts above when is_final was true
409
- }
410
- } else if (msg.type === "Metadata") {
411
- // Connection metadata — ignore
412
- } else if (msg.type === "UtteranceEnd") {
413
- // Utterance boundary — Deepgram detected end of speech
414
- // Nothing extra needed, is_final already handles finalization
415
428
  } else if (msg.type === "Error" || msg.type === "error") {
416
429
  session.onError(msg.message || msg.description || "Deepgram error");
417
430
  }
418
- } catch (e: any) {
419
- // Ignore parse errors for binary data
420
- }
431
+ } catch {}
421
432
  };
422
433
 
423
- ws.onerror = (event: Event) => {
434
+ ws.onerror = () => {
424
435
  if (!session.closed) {
425
436
  session.onError("WebSocket connection error");
426
437
  }
@@ -437,7 +448,6 @@ function startStreamingSession(
437
448
  });
438
449
 
439
450
  recProc.on("close", () => {
440
- // SoX stopped — send CloseStream to Deepgram
441
451
  if (ws.readyState === WebSocket.OPEN) {
442
452
  try { ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
443
453
  }
@@ -449,22 +459,20 @@ function startStreamingSession(
449
459
  function stopStreamingSession(session: StreamingSession): void {
450
460
  if (session.closed) return;
451
461
 
452
- // Stop the microphone
453
462
  try { session.recProcess.kill("SIGTERM"); } catch {}
454
463
 
455
- // CloseStream tells Deepgram to flush remaining audio
456
464
  if (session.ws.readyState === WebSocket.OPEN) {
457
465
  try { session.ws.send(JSON.stringify({ type: "CloseStream" })); } catch {}
458
466
  }
459
467
 
460
- // Safety: finalize after timeout even if Deepgram doesn't respond
468
+ // Safety timeout
461
469
  setTimeout(() => {
462
470
  if (!session.closed) {
463
471
  finalizeSession(session);
464
472
  }
465
473
  }, FINALIZE_SAFETY_TIMEOUT_MS);
466
474
 
467
- // Shorter timeout: if no new data arrives for 1.5s, assume done
475
+ // Quick finalize if no new data
468
476
  let lastDataTime = Date.now();
469
477
  const origOnMessage = session.ws.onmessage;
470
478
  session.ws.onmessage = (event: MessageEvent) => {
@@ -486,21 +494,32 @@ function finalizeSession(session: StreamingSession): void {
486
494
  if (session.closed) return;
487
495
  session.closed = true;
488
496
 
489
- // Clean up keepalive
490
497
  if (session.keepAliveTimer) {
491
498
  clearInterval(session.keepAliveTimer);
492
499
  session.keepAliveTimer = null;
493
500
  }
494
501
 
495
- // Close WebSocket
496
502
  try { session.ws.close(); } catch {}
497
-
498
- // Kill SoX if still running
499
503
  try { session.recProcess.kill("SIGKILL"); } catch {}
500
504
 
501
- // Deliver final transcript
502
505
  const fullText = session.finalizedParts.join(" ").trim();
503
- session.onDone(fullText);
506
+ session.onDone(fullText, {
507
+ hadAudio: session.hadAudioData,
508
+ hadSpeech: session.hadSpeech,
509
+ });
510
+ }
511
+
512
+ // ─── Abort helper — nuke everything synchronously ────────────────────────────
513
+
514
+ function abortSession(session: StreamingSession | null): void {
515
+ if (!session || session.closed) return;
516
+ session.closed = true;
517
+ if (session.keepAliveTimer) {
518
+ clearInterval(session.keepAliveTimer);
519
+ session.keepAliveTimer = null;
520
+ }
521
+ try { session.ws.close(); } catch {}
522
+ try { session.recProcess.kill("SIGKILL"); } catch {}
504
523
  }
505
524
 
506
525
  // ─── Extension ───────────────────────────────────────────────────────────────
@@ -515,11 +534,20 @@ export default function (pi: ExtensionAPI) {
515
534
  let recordingStart = 0;
516
535
  let statusTimer: ReturnType<typeof setInterval> | null = null;
517
536
  let terminalInputUnsub: (() => void) | null = null;
518
- let isHolding = false;
519
537
 
520
538
  // Streaming session state
521
539
  let activeSession: StreamingSession | null = null;
522
540
  let currentTarget: "editor" | "btw" = "editor";
541
+ let retryAttempts = 0;
542
+ let lastStopTime = 0; // For corruption guard
543
+
544
+ // Hold-to-talk state
545
+ let kittyReleaseDetected = false;
546
+ let spaceDownTime: number | null = null;
547
+ let holdActivationTimer: ReturnType<typeof setTimeout> | null = null;
548
+ let spaceConsumed = false; // True once threshold passed and recording started
549
+ let releaseDetectTimer: ReturnType<typeof setTimeout> | null = null;
550
+ let warmupWidgetTimer: ReturnType<typeof setInterval> | null = null;
523
551
 
524
552
  // ─── BTW State ───────────────────────────────────────────────────────────
525
553
 
@@ -548,13 +576,16 @@ export default function (pi: ExtensionAPI) {
548
576
  ctx.ui.setStatus("voice", `MIC ${modeTag}`);
549
577
  break;
550
578
  }
579
+ case "warmup":
580
+ ctx.ui.setStatus("voice", "🎙️ HOLD...");
581
+ break;
551
582
  case "recording": {
552
583
  const secs = Math.round((Date.now() - recordingStart) / 1000);
553
584
  ctx.ui.setStatus("voice", `🔴 REC ${secs}s`);
554
585
  break;
555
586
  }
556
- case "transcribing":
557
- ctx.ui.setStatus("voice", "STT...");
587
+ case "finalizing":
588
+ ctx.ui.setStatus("voice", "STT...");
558
589
  break;
559
590
  }
560
591
  }
@@ -564,20 +595,57 @@ export default function (pi: ExtensionAPI) {
564
595
  updateVoiceStatus();
565
596
  }
566
597
 
598
+ // ─── Cleanup helpers ─────────────────────────────────────────────────────
599
+
600
+ function clearHoldTimer() {
601
+ if (holdActivationTimer) {
602
+ clearTimeout(holdActivationTimer);
603
+ holdActivationTimer = null;
604
+ }
605
+ }
606
+
607
+ function clearReleaseTimer() {
608
+ if (releaseDetectTimer) {
609
+ clearTimeout(releaseDetectTimer);
610
+ releaseDetectTimer = null;
611
+ }
612
+ }
613
+
614
+ function clearWarmupWidget() {
615
+ if (warmupWidgetTimer) {
616
+ clearInterval(warmupWidgetTimer);
617
+ warmupWidgetTimer = null;
618
+ }
619
+ }
620
+
621
+ function clearRecordingAnimTimer() {
622
+ const timer = (showRecordingWidget as any)?._animTimer;
623
+ if (timer) {
624
+ clearInterval(timer);
625
+ (showRecordingWidget as any)._animTimer = null;
626
+ }
627
+ }
628
+
629
+ function hideWidget() {
630
+ if (ctx?.hasUI) ctx.ui.setWidget("voice-recording", undefined);
631
+ }
632
+
567
633
  function voiceCleanup() {
568
634
  if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
569
635
  clearHoldTimer();
570
636
  clearReleaseTimer();
571
- stopRecordingWidgetAnimation();
637
+ clearWarmupWidget();
638
+ clearRecordingAnimTimer();
572
639
  if (activeSession) {
573
- finalizeSession(activeSession);
640
+ abortSession(activeSession);
574
641
  activeSession = null;
575
642
  }
576
643
  if (legacyRecProcess) { legacyRecProcess.kill("SIGTERM"); legacyRecProcess = null; }
577
644
  if (tempFile) { try { fs.unlinkSync(tempFile); } catch {} tempFile = null; }
578
- isHolding = false;
579
645
  spaceConsumed = false;
580
646
  spaceDownTime = null;
647
+ retryAttempts = 0;
648
+ hideWidget();
581
649
  setVoiceState("idle");
582
650
  }
583
651
 
@@ -609,58 +677,64 @@ export default function (pi: ExtensionAPI) {
609
677
  ].join("\n"), validated ? "info" : "warning");
610
678
  }
611
679
 
612
- // ─── Live Transcript Widget (Component-based, themed) ───────────────────
680
+ // ─── Warmup Widget ──────────────────────────────────────────────────────
681
+ //
682
+ // During the 500ms hold threshold, show a subtle "keep holding…" hint
683
+ // with a progress indicator. This matches Claude Code's warmup pattern.
613
684
 
614
- /** Subtle hint shown during the hold threshold wait */
615
- function showHoldHintWidget() {
685
+ function showWarmupWidget() {
616
686
  if (!ctx?.hasUI) return;
617
- ctx.ui.setWidget("voice-recording", (tui, theme) => {
618
- return {
619
- invalidate() {},
620
- render(width: number): string[] {
621
- const bar = theme.fg("muted", "─".repeat(Math.min(width - 2, 60)));
622
- return [
623
- bar,
624
- theme.fg("dim", " Hold " + theme.bold("SPACE") + " for voice input..."),
625
- bar,
626
- ];
627
- },
628
- };
629
- }, { placement: "aboveEditor" });
630
- }
631
687
 
632
- function hideHoldHintWidget() {
633
- if (!ctx?.hasUI) return;
634
- ctx.ui.setWidget("voice-recording", undefined);
688
+ const startTime = Date.now();
689
+
690
+ const renderWarmup = () => {
691
+ if (!ctx?.hasUI) return;
692
+ const elapsed = Date.now() - startTime;
693
+ const progress = Math.min(elapsed / HOLD_THRESHOLD_MS, 1);
694
+ const barLen = 20;
695
+ const filled = Math.round(progress * barLen);
696
+ const empty = barLen - filled;
697
+
698
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
699
+ return {
700
+ invalidate() {},
701
+ render(width: number): string[] {
702
+ const maxW = Math.min(width - 2, 60);
703
+ const bar = theme.fg("accent", "█".repeat(filled)) + theme.fg("muted", "░".repeat(empty));
704
+ const hint = progress < 0.6
705
+ ? theme.fg("dim", "Keep holding " + theme.bold("SPACE") + " for voice…")
706
+ : theme.fg("accent", "Almost there… keep holding…");
707
+ const border = theme.fg("border", "─".repeat(maxW));
708
+ return [border, ` ${bar} ${hint}`, border];
709
+ },
710
+ };
711
+ }, { placement: "aboveEditor" });
712
+ };
713
+
714
+ renderWarmup();
715
+ warmupWidgetTimer = setInterval(renderWarmup, 50);
635
716
  }
636
717
 
637
- /** Animated recording indicator with live waveform */
718
+ // ─── Recording Widget ───────────────────────────────────────────────────
719
+
720
+ const waveChars = ["▁", "▂", "▃", "▅", "▆", "▇", "▆", "▅", "▃", "▂"];
721
+
638
722
  function showRecordingWidget(target: "editor" | "btw") {
639
723
  if (!ctx?.hasUI) return;
640
724
 
641
- // Store initial state — once live transcription arrives,
642
- // updateLiveTranscriptWidget takes over and we stop the animation.
643
- (showRecordingWidget as any)._target = target;
644
725
  (showRecordingWidget as any)._frame = 0;
645
726
  (showRecordingWidget as any)._hasTranscript = false;
646
727
 
647
- // Animate the widget every 300ms (only while no transcript is showing)
648
728
  const animTimer = setInterval(() => {
649
- // Stop animating once live transcript takes over
650
729
  if ((showRecordingWidget as any)?._hasTranscript) return;
651
-
652
730
  (showRecordingWidget as any)._frame = ((showRecordingWidget as any)._frame || 0) + 1;
653
731
  showRecordingWidgetFrame(target, (showRecordingWidget as any)._frame);
654
732
  }, 300);
655
733
 
656
- // Store the timer so we can clean it up
657
734
  (showRecordingWidget as any)._animTimer = animTimer;
658
-
659
735
  showRecordingWidgetFrame(target, 0);
660
736
  }
661
737
 
662
- const waveChars = ["▁", "▂", "▃", "▅", "▆", "▇", "▆", "▅", "▃", "▂"];
663
-
664
738
  function showRecordingWidgetFrame(target: "editor" | "btw", frame: number) {
665
739
  if (!ctx?.hasUI) return;
666
740
  ctx.ui.setWidget("voice-recording", (tui, theme) => {
@@ -673,7 +747,6 @@ export default function (pi: ExtensionAPI) {
673
747
  const secs = elapsed % 60;
674
748
  const timeStr = mins > 0 ? `${mins}:${String(secs).padStart(2, "0")}` : `${secs}s`;
675
749
 
676
- // Animated waveform
677
750
  const waveLen = 12;
678
751
  let wave = "";
679
752
  for (let i = 0; i < waveLen; i++) {
@@ -702,33 +775,25 @@ export default function (pi: ExtensionAPI) {
702
775
  ? theme.fg("dim", " Release SPACE to finalize")
703
776
  : theme.fg("dim", " Release SPACE to stop");
704
777
 
705
- const lines = [
778
+ return [
706
779
  topBorder,
707
780
  theme.fg("borderAccent", "│") + pad(titleLine, maxW) + theme.fg("borderAccent", "│"),
708
781
  theme.fg("borderAccent", "│") + pad(hint, maxW) + theme.fg("borderAccent", "│"),
709
782
  botBorder,
710
783
  ];
711
- return lines;
712
784
  },
713
785
  };
714
786
  }, { placement: "aboveEditor" });
715
787
  }
716
788
 
717
- function stopRecordingWidgetAnimation() {
718
- const timer = (showRecordingWidget as any)?._animTimer;
719
- if (timer) {
720
- clearInterval(timer);
721
- (showRecordingWidget as any)._animTimer = null;
722
- }
723
- }
789
+ // ─── Live Transcript Widget ─────────────────────────────────────────────
724
790
 
725
- /** Show live transcript inside a themed box */
726
791
  function updateLiveTranscriptWidget(interim: string, finals: string[]) {
727
792
  if (!ctx?.hasUI) return;
728
793
 
729
- // Stop the recording animation — live transcript takes over
794
+ // Stop the waveform animation — live transcript takes over
730
795
  (showRecordingWidget as any)._hasTranscript = true;
731
- stopRecordingWidgetAnimation();
796
+ clearRecordingAnimTimer();
732
797
 
733
798
  const finalized = finals.join(" ");
734
799
  const displayText = finalized + (interim ? (finalized ? " " : "") + interim : "");
@@ -756,15 +821,14 @@ export default function (pi: ExtensionAPI) {
756
821
  const label = theme.bold(theme.fg("accent", " VOICE "));
757
822
  const timeStyled = theme.fg("muted", timeStr);
758
823
  const titleLine = ` ${dot} ${label} ${timeStyled}`;
759
- const hint = theme.fg("dim", " Release SPACE to stop");
824
+ const hint = theme.fg("dim", " Release SPACE to finalize");
760
825
  const lines = [topBorder, side(titleLine)];
761
826
 
762
827
  if (!displayText.trim()) {
763
- lines.push(side(theme.fg("dim", " Listening... speak now")));
828
+ lines.push(side(theme.fg("dim", " Listening… speak now")));
764
829
  } else {
765
830
  lines.push(sep);
766
- // Word-wrap the transcript text
767
- const innerMax = maxW - 4; // padding inside box
831
+ const innerMax = maxW - 4;
768
832
  const words = displayText.split(" ");
769
833
  const wrappedLines: string[] = [];
770
834
  let currentLine = "";
@@ -779,11 +843,9 @@ export default function (pi: ExtensionAPI) {
779
843
  }
780
844
  if (currentLine) wrappedLines.push(currentLine);
781
845
 
782
- // Show last 3 lines of transcript
783
846
  const visible = wrappedLines.slice(-3);
784
847
  for (let i = 0; i < visible.length; i++) {
785
848
  let line = visible[i];
786
- // Style: finalized parts in normal text, interim in accent
787
849
  if (i === visible.length - 1 && interim) {
788
850
  line = theme.fg("text", line) + theme.fg("accent", "▍");
789
851
  } else {
@@ -801,8 +863,9 @@ export default function (pi: ExtensionAPI) {
801
863
  }, { placement: "aboveEditor" });
802
864
  }
803
865
 
804
- /** Transcribing state show a processing indicator */
805
- function showTranscribingWidget() {
866
+ // ─── Finalizing Widget ──────────────────────────────────────────────────
867
+
868
+ function showFinalizingWidget() {
806
869
  if (!ctx?.hasUI) return;
807
870
  ctx.ui.setWidget("voice-recording", (tui, theme) => {
808
871
  return {
@@ -818,7 +881,7 @@ export default function (pi: ExtensionAPI) {
818
881
  };
819
882
  const spinner = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
820
883
  const idx = Math.floor(Date.now() / 100) % spinner.length;
821
- const line = ` ${theme.fg("accent", spinner[idx])} ${theme.fg("dim", "Finalizing transcription...")}`;
884
+ const line = ` ${theme.fg("accent", spinner[idx])} ${theme.fg("dim", "Finalizing transcription…")}`;
822
885
  return [topBorder, side(line), botBorder];
823
886
  },
824
887
  };
@@ -828,104 +891,147 @@ export default function (pi: ExtensionAPI) {
828
891
  // ─── Voice: Start / Stop (Streaming or Legacy) ───────────────────────────
829
892
 
830
893
  async function startVoiceRecording(target: "editor" | "btw" = "editor"): Promise<boolean> {
831
- if (voiceState !== "idle" || !ctx) return false;
894
+ if (!ctx) return false;
895
+
896
+ // ── SESSION CORRUPTION GUARD ──
897
+ // If we're still finalizing from a previous recording, abort it first.
898
+ // This prevents the "slow connection overlaps new recording" bug.
899
+ if (voiceState === "finalizing" || voiceState === "recording") {
900
+ abortSession(activeSession);
901
+ activeSession = null;
902
+ clearRecordingAnimTimer();
903
+ clearWarmupWidget();
904
+ hideWidget();
905
+ setVoiceState("idle");
906
+ // Brief pause to let resources release
907
+ await new Promise((r) => setTimeout(r, CORRUPTION_GUARD_MS));
908
+ }
909
+
910
+ // ── STALE TRANSCRIPT CLEANUP ──
911
+ // Clear any prior transcript from the widget
912
+ hideWidget();
832
913
 
833
914
  currentTarget = target;
834
915
  recordingStart = Date.now();
916
+ retryAttempts = 0;
835
917
 
836
918
  if (isDeepgramStreaming(config)) {
837
- // === STREAMING PATH === (Deepgram WebSocket)
838
- setVoiceState("recording");
919
+ return startStreamingRecording(target);
920
+ } else {
921
+ return startLegacyRecording(target);
922
+ }
923
+ }
839
924
 
840
- const session = startStreamingSession(config, {
841
- onTranscript: (interim, finals) => {
842
- updateLiveTranscriptWidget(interim, finals);
843
- updateVoiceStatus();
844
- },
845
- onDone: (fullText) => {
846
- activeSession = null;
847
- stopRecordingWidgetAnimation();
848
- ctx?.ui.setWidget("voice-recording", undefined);
925
+ async function startStreamingRecording(target: "editor" | "btw"): Promise<boolean> {
926
+ setVoiceState("recording");
849
927
 
850
- if (!fullText.trim()) {
928
+ const session = startStreamingSession(config, {
929
+ onTranscript: (interim, finals) => {
930
+ // Live transcript update — this is the key UX feature
931
+ updateLiveTranscriptWidget(interim, finals);
932
+ updateVoiceStatus();
933
+ },
934
+ onDone: (fullText, meta) => {
935
+ activeSession = null;
936
+ clearRecordingAnimTimer();
937
+ hideWidget();
938
+ lastStopTime = Date.now();
939
+
940
+ if (!fullText.trim()) {
941
+ // ── DISTINGUISH SILENCE VS NO SPEECH ──
942
+ if (!meta.hadAudio) {
943
+ ctx?.ui.notify("Microphone captured no audio. Check mic permissions.", "error");
944
+ } else if (!meta.hadSpeech) {
945
+ ctx?.ui.notify("Microphone captured silence — no speech detected.", "warning");
946
+ } else {
851
947
  ctx?.ui.notify("No speech detected.", "warning");
852
- setVoiceState("idle");
853
- return;
854
948
  }
949
+ setVoiceState("idle");
950
+ return;
951
+ }
855
952
 
856
- if (target === "btw") {
857
- handleBtw(fullText);
858
- } else {
859
- if (ctx?.hasUI) {
860
- const existing = ctx.ui.getEditorText();
861
- ctx.ui.setEditorText(existing ? existing + " " + fullText : fullText);
862
- const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
863
- ctx.ui.notify(
864
- `STT (${elapsed}s): ${fullText.slice(0, 80)}${fullText.length > 80 ? "..." : ""}`,
865
- "info",
866
- );
867
- }
953
+ if (target === "btw") {
954
+ handleBtw(fullText);
955
+ } else {
956
+ if (ctx?.hasUI) {
957
+ const existing = ctx.ui.getEditorText();
958
+ ctx.ui.setEditorText(existing ? existing + " " + fullText : fullText);
959
+ const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
960
+ ctx.ui.notify(
961
+ `STT (${elapsed}s): ${fullText.slice(0, 80)}${fullText.length > 80 ? "…" : ""}`,
962
+ "info",
963
+ );
868
964
  }
869
- setVoiceState("idle");
870
- },
871
- onError: (err) => {
872
- activeSession = null;
873
- stopRecordingWidgetAnimation();
874
- ctx?.ui.setWidget("voice-recording", undefined);
875
- ctx?.ui.notify(`STT error: ${err}`, "error");
876
- setVoiceState("idle");
877
- },
878
- });
965
+ }
966
+ setVoiceState("idle");
967
+ },
968
+ onError: (err) => {
969
+ activeSession = null;
970
+ clearRecordingAnimTimer();
971
+ hideWidget();
972
+
973
+ // ── TRANSIENT FAILURE RETRY ──
974
+ // On WebSocket error during rapid push-to-talk re-press, auto-retry
975
+ if (retryAttempts < MAX_RETRY_ATTEMPTS) {
976
+ retryAttempts++;
977
+ ctx?.ui.notify(`Voice connection error — retrying (${retryAttempts}/${MAX_RETRY_ATTEMPTS})…`, "warning");
978
+ setTimeout(() => {
979
+ if (voiceState !== "idle") {
980
+ setVoiceState("idle");
981
+ }
982
+ startStreamingRecording(target);
983
+ }, RETRY_DELAY_MS);
984
+ return;
985
+ }
879
986
 
880
- if (!session) {
987
+ ctx?.ui.notify(`STT error: ${err}`, "error");
881
988
  setVoiceState("idle");
882
- return false;
883
- }
989
+ },
990
+ });
991
+
992
+ if (!session) {
993
+ setVoiceState("idle");
994
+ return false;
995
+ }
884
996
 
885
- activeSession = session;
997
+ activeSession = session;
886
998
 
887
- // Status timer for elapsed time
888
- statusTimer = setInterval(() => {
889
- if (voiceState === "recording") {
890
- updateVoiceStatus();
891
- const elapsed = (Date.now() - recordingStart) / 1000;
892
- if (elapsed >= MAX_RECORDING_SECS) {
893
- isHolding = false;
894
- stopVoiceRecording(target);
895
- }
999
+ // Status timer for elapsed time
1000
+ statusTimer = setInterval(() => {
1001
+ if (voiceState === "recording") {
1002
+ updateVoiceStatus();
1003
+ const elapsed = (Date.now() - recordingStart) / 1000;
1004
+ if (elapsed >= MAX_RECORDING_SECS) {
1005
+ stopVoiceRecording(target);
896
1006
  }
897
- }, 1000);
1007
+ }
1008
+ }, 1000);
898
1009
 
899
- // Show the themed recording widget
900
- showRecordingWidget(target);
901
- return true;
1010
+ showRecordingWidget(target);
1011
+ return true;
1012
+ }
902
1013
 
903
- } else {
904
- // === LEGACY PATH === (file-based for local backends)
905
- tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
906
- if (!startLegacyRecordingToFile(tempFile)) {
907
- ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
908
- return false;
909
- }
1014
+ async function startLegacyRecording(target: "editor" | "btw"): Promise<boolean> {
1015
+ if (!ctx) return false;
1016
+ tempFile = path.join(os.tmpdir(), `pi-voice-${Date.now()}.wav`);
1017
+ if (!startLegacyRecordingToFile(tempFile)) {
1018
+ ctx.ui.notify("Voice requires SoX. Install: brew install sox", "error");
1019
+ return false;
1020
+ }
910
1021
 
911
- setVoiceState("recording");
912
- statusTimer = setInterval(() => {
913
- if (voiceState === "recording") {
914
- updateVoiceStatus();
915
- const elapsed = (Date.now() - recordingStart) / 1000;
916
- if (elapsed >= MAX_RECORDING_SECS) {
917
- isHolding = false;
918
- stopVoiceRecording(target);
919
- }
1022
+ setVoiceState("recording");
1023
+ statusTimer = setInterval(() => {
1024
+ if (voiceState === "recording") {
1025
+ updateVoiceStatus();
1026
+ const elapsed = (Date.now() - recordingStart) / 1000;
1027
+ if (elapsed >= MAX_RECORDING_SECS) {
1028
+ stopVoiceRecording(target);
920
1029
  }
921
- }, 1000);
922
-
923
- if (ctx.hasUI) {
924
- // Show themed recording widget for legacy path
925
- showRecordingWidget(target);
926
1030
  }
927
- return true;
928
- }
1031
+ }, 1000);
1032
+
1033
+ showRecordingWidget(target);
1034
+ return true;
929
1035
  }
930
1036
 
931
1037
  async function stopVoiceRecording(target: "editor" | "btw" = "editor") {
@@ -933,34 +1039,35 @@ export default function (pi: ExtensionAPI) {
933
1039
  if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
934
1040
 
935
1041
  if (activeSession) {
936
- // === STREAMING PATH === Stop the stream, finalize will call onDone
937
- setVoiceState("transcribing");
938
- stopRecordingWidgetAnimation();
939
- showTranscribingWidget();
1042
+ setVoiceState("finalizing");
1043
+ clearRecordingAnimTimer();
1044
+ showFinalizingWidget();
940
1045
  stopStreamingSession(activeSession);
941
1046
  return;
942
1047
  }
943
1048
 
944
- // === LEGACY PATH ===
1049
+ // Legacy path
945
1050
  const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
946
1051
  const audioFile = tempFile;
947
- setVoiceState("transcribing");
948
- stopRecordingWidgetAnimation();
949
- showTranscribingWidget();
1052
+ setVoiceState("finalizing");
1053
+ clearRecordingAnimTimer();
1054
+ showFinalizingWidget();
950
1055
 
951
1056
  await stopLegacyRecording();
952
1057
 
953
1058
  if (!audioFile || !fs.existsSync(audioFile)) {
954
1059
  ctx.ui.notify("No audio recorded.", "warning");
1060
+ hideWidget();
955
1061
  setVoiceState("idle");
956
1062
  return;
957
1063
  }
958
1064
 
959
1065
  const stats = fs.statSync(audioFile);
960
1066
  if (stats.size < 1000) {
961
- ctx.ui.notify("Recording too short.", "warning");
1067
+ ctx.ui.notify("Recording too short — mic captured silence.", "warning");
962
1068
  try { fs.unlinkSync(audioFile); } catch {}
963
1069
  tempFile = null;
1070
+ hideWidget();
964
1071
  setVoiceState("idle");
965
1072
  return;
966
1073
  }
@@ -971,6 +1078,8 @@ export default function (pi: ExtensionAPI) {
971
1078
  try { fs.unlinkSync(audioFile); } catch {}
972
1079
  if (tempFile === audioFile) tempFile = null;
973
1080
 
1081
+ hideWidget();
1082
+
974
1083
  if (result.error) {
975
1084
  ctx.ui.notify(`STT error: ${result.error}`, "error");
976
1085
  setVoiceState("idle");
@@ -991,7 +1100,7 @@ export default function (pi: ExtensionAPI) {
991
1100
  const existing = ctx.ui.getEditorText();
992
1101
  ctx.ui.setEditorText(existing ? existing + " " + transcript : transcript);
993
1102
  ctx.ui.notify(
994
- `STT (${elapsed}s): ${transcript.slice(0, 80)}${transcript.length > 80 ? "..." : ""}`,
1103
+ `STT (${elapsed}s): ${transcript.slice(0, 80)}${transcript.length > 80 ? "" : ""}`,
995
1104
  "info",
996
1105
  );
997
1106
  }
@@ -1000,83 +1109,46 @@ export default function (pi: ExtensionAPI) {
1000
1109
  setVoiceState("idle");
1001
1110
  }
1002
1111
 
1003
- // ─── Hold-to-talk with Duration Threshold ──────────────────────────────
1004
- //
1005
- // SPACE activates voice ONLY when:
1006
- // 1. The editor is empty (no text typed yet)
1007
- // 2. SPACE is held for ≥ HOLD_THRESHOLD_MS (500ms)
1112
+ // ─── Hold-to-Talk State Machine ─────────────────────────────────────────
1008
1113
  //
1009
- // If SPACE is released before the threshold, a regular space character
1010
- // is typed into the editor (normal typing behavior).
1114
+ // SPACE key handling with strict hold-duration detection:
1011
1115
  //
1012
- // KEY DESIGN for non-Kitty terminals (no key-release events):
1013
- // Holding a key generates rapid press events (~30ms apart). We detect
1014
- // "release" by watching for the stream of space presses to STOP.
1015
- // Once the gap exceeds RELEASE_DETECT_MS (200ms), we know the user
1016
- // lifted their finger and we stop recording.
1116
+ // 1. SPACE press (first) enter "warmup" state, start 500ms timer
1117
+ // 2. During warmup: show progress bar, consume repeat presses
1118
+ // 3. Timer fires transition to "recording", start voice capture
1119
+ // 4. SPACE release stop recording, finalize
1120
+ // 5. If released during warmup cancel, type a space character
1017
1121
  //
1018
- // Flow:
1019
- // Hold SPACE → rapid presses arrive → first press starts 500ms timer →
1020
- // timer fires → recording starts → presses keep coming (consumed) →
1021
- // user releases → presses stop → 200ms silence → auto-stop recording
1022
- //
1023
- // Kitty protocol terminals get true key-release events and work natively.
1024
-
1025
- const HOLD_THRESHOLD_MS = 500; // minimum hold time before voice activates
1026
- const RELEASE_DETECT_MS = 200; // gap in key-repeat that means "released"
1027
- let kittyReleaseDetected = false;
1028
- let spaceDownTime: number | null = null;
1029
- let holdActivationTimer: ReturnType<typeof setTimeout> | null = null;
1030
- let spaceConsumed = false;
1031
- let lastSpacePressTime = 0;
1032
- let releaseDetectTimer: ReturnType<typeof setTimeout> | null = null;
1033
-
1034
- function clearHoldTimer() {
1035
- if (holdActivationTimer) {
1036
- clearTimeout(holdActivationTimer);
1037
- holdActivationTimer = null;
1038
- }
1039
- }
1040
-
1041
- function clearReleaseTimer() {
1042
- if (releaseDetectTimer) {
1043
- clearTimeout(releaseDetectTimer);
1044
- releaseDetectTimer = null;
1045
- }
1046
- }
1122
+ // Non-Kitty detection: rapid press events = "holding", gap > 150ms = "released"
1047
1123
 
1048
- /** Called when we detect the user has released SPACE (non-Kitty) */
1049
1124
  function onSpaceReleaseDetected() {
1050
1125
  releaseDetectTimer = null;
1051
1126
 
1052
- // If we're still in the threshold wait (< 500ms), user just tapped space
1053
- if (spaceDownTime && !spaceConsumed) {
1127
+ // Released during warmup cancel, type a space
1128
+ if (voiceState === "warmup") {
1054
1129
  clearHoldTimer();
1130
+ clearWarmupWidget();
1131
+ hideWidget();
1132
+ setVoiceState("idle");
1055
1133
  spaceDownTime = null;
1056
1134
  spaceConsumed = false;
1057
- // Insert a space character
1058
1135
  if (ctx?.hasUI) {
1059
1136
  ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
1060
- hideHoldHintWidget();
1061
1137
  }
1062
1138
  return;
1063
1139
  }
1064
1140
 
1065
- // If we're recording, stop
1141
+ // Released during recording stop
1066
1142
  if (spaceConsumed && voiceState === "recording") {
1067
- isHolding = false;
1068
1143
  spaceConsumed = false;
1069
1144
  spaceDownTime = null;
1070
1145
  stopVoiceRecording("editor");
1071
1146
  }
1072
1147
  }
1073
1148
 
1074
- /** Reset the release detection timer — called on every space press */
1075
1149
  function resetReleaseDetect() {
1076
1150
  clearReleaseTimer();
1077
- // If we're in a hold state (threshold pending or recording),
1078
- // start a timer to detect release
1079
- if (spaceDownTime || spaceConsumed || voiceState === "recording") {
1151
+ if (voiceState === "warmup" || voiceState === "recording" || spaceDownTime || spaceConsumed) {
1080
1152
  releaseDetectTimer = setTimeout(onSpaceReleaseDetected, RELEASE_DETECT_MS);
1081
1153
  }
1082
1154
  }
@@ -1091,33 +1163,26 @@ export default function (pi: ExtensionAPI) {
1091
1163
 
1092
1164
  // ── SPACE handling ──
1093
1165
  if (matchesKey(data, "space")) {
1094
- // RULE: If editor has content, SPACE always types a space — never voice
1095
- const editorText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
1096
- if (editorText && editorText.trim().length > 0) {
1097
- clearHoldTimer();
1098
- clearReleaseTimer();
1099
- spaceDownTime = null;
1100
- spaceConsumed = false;
1101
- return undefined; // let the default space character through
1102
- }
1103
1166
 
1104
1167
  // ── Kitty key-release ──
1105
1168
  if (isKeyRelease(data)) {
1106
1169
  kittyReleaseDetected = true;
1107
1170
  clearReleaseTimer();
1108
1171
 
1109
- // Released before threshold → type a space character
1110
- if (spaceDownTime && !spaceConsumed) {
1172
+ // Released during warmupcancel, type a space
1173
+ if (voiceState === "warmup") {
1111
1174
  clearHoldTimer();
1175
+ clearWarmupWidget();
1176
+ hideWidget();
1177
+ setVoiceState("idle");
1112
1178
  spaceDownTime = null;
1113
1179
  spaceConsumed = false;
1114
1180
  if (ctx?.hasUI) ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
1115
1181
  return { consume: true };
1116
1182
  }
1117
1183
 
1118
- // Released after threshold → stop recording (true hold-to-talk)
1184
+ // Released during recording → stop
1119
1185
  if (spaceConsumed && voiceState === "recording") {
1120
- isHolding = false;
1121
1186
  spaceConsumed = false;
1122
1187
  spaceDownTime = null;
1123
1188
  stopVoiceRecording("editor");
@@ -1129,60 +1194,58 @@ export default function (pi: ExtensionAPI) {
1129
1194
  return undefined;
1130
1195
  }
1131
1196
 
1132
- // ── Kitty key-repeat: ALWAYS suppress while holding/recording ──
1197
+ // ── Kitty key-repeat: suppress while in warmup/recording ──
1133
1198
  if (isKeyRepeat(data)) {
1134
- if (spaceDownTime || spaceConsumed || isHolding || voiceState === "recording") {
1135
- resetReleaseDetect(); // keep resetting — still holding
1199
+ if (voiceState === "warmup" || voiceState === "recording" || voiceState === "finalizing" || spaceConsumed) {
1200
+ resetReleaseDetect();
1136
1201
  return { consume: true };
1137
1202
  }
1138
1203
  return undefined;
1139
1204
  }
1140
1205
 
1141
1206
  // === Key PRESS ===
1142
- // In non-Kitty terminals, holding a key sends rapid press events.
1143
- // We use these to detect "still holding" and the gap to detect "released".
1144
-
1145
- // Reset release detection — user is still holding
1146
1207
  resetReleaseDetect();
1147
1208
 
1148
- // If transcribing → ignore
1149
- if (voiceState === "transcribing") {
1209
+ // If finalizing → ignore
1210
+ if (voiceState === "finalizing") {
1150
1211
  return { consume: true };
1151
1212
  }
1152
1213
 
1153
- // If already recording → just consume (release detect handles stop)
1214
+ // If already recording → just consume (release handles stop)
1154
1215
  if (voiceState === "recording") {
1155
1216
  return { consume: true };
1156
1217
  }
1157
1218
 
1158
- // If we already started the hold timer, this is a repeat → consume
1159
- if (spaceDownTime) {
1219
+ // If already in warmup consume (threshold timer is running)
1220
+ if (voiceState === "warmup") {
1221
+ return { consume: true };
1222
+ }
1223
+
1224
+ // If we've already consumed space for this hold → consume
1225
+ if (spaceConsumed || spaceDownTime) {
1160
1226
  return { consume: true };
1161
1227
  }
1162
1228
 
1163
- // Idle, first press → start the hold timer
1229
+ // IDLE first press → start warmup
1164
1230
  if (voiceState === "idle") {
1165
1231
  spaceDownTime = Date.now();
1166
1232
  spaceConsumed = false;
1167
- lastSpacePressTime = Date.now();
1168
1233
 
1169
- // Show a subtle "preparing" indicator
1170
- if (ctx?.hasUI) {
1171
- showHoldHintWidget();
1172
- }
1234
+ // Transition to warmup state
1235
+ setVoiceState("warmup");
1236
+ showWarmupWidget();
1173
1237
 
1174
1238
  // After threshold: activate voice recording
1175
1239
  holdActivationTimer = setTimeout(() => {
1176
1240
  holdActivationTimer = null;
1177
- const currentText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
1178
- if (voiceState === "idle" && spaceDownTime && !(currentText && currentText.trim().length > 0)) {
1241
+ if (voiceState === "warmup" && spaceDownTime) {
1242
+ clearWarmupWidget();
1179
1243
  spaceConsumed = true;
1180
- isHolding = true;
1181
1244
  startVoiceRecording("editor").then((ok) => {
1182
1245
  if (!ok) {
1183
- isHolding = false;
1184
1246
  spaceConsumed = false;
1185
1247
  spaceDownTime = null;
1248
+ setVoiceState("idle");
1186
1249
  }
1187
1250
  });
1188
1251
  } else {
@@ -1194,17 +1257,19 @@ export default function (pi: ExtensionAPI) {
1194
1257
  return { consume: true };
1195
1258
  }
1196
1259
 
1197
- if (isHolding || spaceConsumed) return { consume: true };
1260
+ if (spaceConsumed) return { consume: true };
1198
1261
  return undefined;
1199
1262
  }
1200
1263
 
1201
- // ── Any other key while holding space (pre-threshold) → cancel hold, insert space ──
1202
- if (spaceDownTime && !spaceConsumed && !matchesKey(data, "space")) {
1264
+ // ── Any other key during warmup → cancel hold, type a space ──
1265
+ if (voiceState === "warmup" && spaceDownTime && !spaceConsumed) {
1203
1266
  clearHoldTimer();
1204
1267
  clearReleaseTimer();
1268
+ clearWarmupWidget();
1269
+ hideWidget();
1270
+ setVoiceState("idle");
1205
1271
  if (ctx?.hasUI) {
1206
1272
  ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
1207
- hideHoldHintWidget();
1208
1273
  }
1209
1274
  spaceDownTime = null;
1210
1275
  spaceConsumed = false;
@@ -1215,8 +1280,7 @@ export default function (pi: ExtensionAPI) {
1215
1280
  if (matchesKey(data, "ctrl+shift+b")) {
1216
1281
  if (isKeyRelease(data)) {
1217
1282
  kittyReleaseDetected = true;
1218
- if (isHolding && voiceState === "recording") {
1219
- isHolding = false;
1283
+ if (voiceState === "recording" && currentTarget === "btw") {
1220
1284
  stopVoiceRecording("btw");
1221
1285
  return { consume: true };
1222
1286
  }
@@ -1224,25 +1288,23 @@ export default function (pi: ExtensionAPI) {
1224
1288
  }
1225
1289
 
1226
1290
  if (isKeyRepeat(data)) {
1227
- if (isHolding) return { consume: true };
1291
+ if (voiceState === "recording" && currentTarget === "btw") return { consume: true };
1228
1292
  return undefined;
1229
1293
  }
1230
1294
 
1231
- if (voiceState === "recording") {
1232
- isHolding = false;
1295
+ if (voiceState === "recording" && currentTarget === "btw") {
1233
1296
  stopVoiceRecording("btw");
1234
1297
  return { consume: true };
1235
1298
  }
1236
1299
 
1237
- if (voiceState === "idle" && !isHolding) {
1238
- isHolding = true;
1239
- startVoiceRecording("btw").then((ok) => {
1240
- if (!ok) isHolding = false;
1241
- });
1300
+ if (voiceState === "idle") {
1301
+ startVoiceRecording("btw");
1242
1302
  return { consume: true };
1243
1303
  }
1244
1304
 
1245
- if (isHolding) return { consume: true };
1305
+ if (voiceState === "recording" || voiceState === "finalizing" || voiceState === "warmup") {
1306
+ return { consume: true };
1307
+ }
1246
1308
  return undefined;
1247
1309
  }
1248
1310
 
@@ -1286,12 +1348,12 @@ export default function (pi: ExtensionAPI) {
1286
1348
  "",
1287
1349
  ];
1288
1350
 
1289
- lines.push(` Q: ${last.question.slice(0, 100)}${last.question.length > 100 ? "..." : ""}`);
1351
+ lines.push(` Q: ${last.question.slice(0, 100)}${last.question.length > 100 ? "" : ""}`);
1290
1352
  const answerLines = last.answer.split("\n");
1291
1353
  for (const line of answerLines.slice(0, 8)) {
1292
1354
  lines.push(` ${line}`);
1293
1355
  }
1294
- if (answerLines.length > 8) lines.push(" ...");
1356
+ if (answerLines.length > 8) lines.push(" ");
1295
1357
 
1296
1358
  lines.push("");
1297
1359
  lines.push(" /btw:clear to dismiss | /btw:inject to send to agent");
@@ -1307,9 +1369,9 @@ export default function (pi: ExtensionAPI) {
1307
1369
  ctx.ui.setWidget("btw", [
1308
1370
  " BTW",
1309
1371
  "",
1310
- ` Q: ${message.slice(0, 100)}${message.length > 100 ? "..." : ""}`,
1372
+ ` Q: ${message.slice(0, 100)}${message.length > 100 ? "" : ""}`,
1311
1373
  "",
1312
- " Thinking...",
1374
+ " Thinking",
1313
1375
  ], { placement: "aboveEditor" });
1314
1376
 
1315
1377
  const btwContext = buildBtwContext();
@@ -1394,16 +1456,12 @@ export default function (pi: ExtensionAPI) {
1394
1456
  return;
1395
1457
  }
1396
1458
  if (voiceState === "idle") {
1397
- // Direct start — bypass hold threshold
1398
1459
  spaceConsumed = true;
1399
- isHolding = true;
1400
1460
  const ok = await startVoiceRecording("editor");
1401
1461
  if (!ok) {
1402
- isHolding = false;
1403
1462
  spaceConsumed = false;
1404
1463
  }
1405
1464
  } else if (voiceState === "recording") {
1406
- isHolding = false;
1407
1465
  spaceConsumed = false;
1408
1466
  spaceDownTime = null;
1409
1467
  clearHoldTimer();
@@ -1422,9 +1480,7 @@ export default function (pi: ExtensionAPI) {
1422
1480
  configSource = loaded.source;
1423
1481
  updateSocketPath(config, currentCwd);
1424
1482
 
1425
- // Auto-capture DEEPGRAM_API_KEY from env into config if not already stored.
1426
- // This ensures streaming works even when Pi is launched from a context
1427
- // that doesn't source .zshrc (GUI app, tmux, etc.)
1483
+ // Auto-capture DEEPGRAM_API_KEY from env into config
1428
1484
  if (process.env.DEEPGRAM_API_KEY && !config.deepgramApiKey) {
1429
1485
  config.deepgramApiKey = process.env.DEEPGRAM_API_KEY;
1430
1486
  if (configSource !== "default") {
@@ -1432,7 +1488,7 @@ export default function (pi: ExtensionAPI) {
1432
1488
  }
1433
1489
  }
1434
1490
 
1435
- // Also try to load DEEPGRAM_API_KEY from shell if not in process.env and not in config
1491
+ // Try to load DEEPGRAM_API_KEY from shell if not available
1436
1492
  if (!resolveDeepgramApiKey(config) && config.backend === "deepgram") {
1437
1493
  try {
1438
1494
  const result = spawnSync("zsh", ["-ic", "echo $DEEPGRAM_API_KEY"], {
@@ -1443,7 +1499,7 @@ export default function (pi: ExtensionAPI) {
1443
1499
  const shellKey = result.stdout?.toString().trim();
1444
1500
  if (shellKey && shellKey.length > 5) {
1445
1501
  config.deepgramApiKey = shellKey;
1446
- process.env.DEEPGRAM_API_KEY = shellKey; // Also set for child processes
1502
+ process.env.DEEPGRAM_API_KEY = shellKey;
1447
1503
  if (configSource !== "default") {
1448
1504
  saveConfig(config, config.scope, currentCwd);
1449
1505
  }
@@ -1454,7 +1510,6 @@ export default function (pi: ExtensionAPI) {
1454
1510
  if (config.enabled && config.onboarding.completed) {
1455
1511
  updateVoiceStatus();
1456
1512
  setupHoldToTalk();
1457
- // Only start daemon for non-streaming backends
1458
1513
  if (!isDeepgramStreaming(config)) {
1459
1514
  ensureDaemon(config).catch(() => {});
1460
1515
  }
@@ -1515,7 +1570,15 @@ export default function (pi: ExtensionAPI) {
1515
1570
  ensureDaemon(config).catch(() => {});
1516
1571
  }
1517
1572
  const mode = isDeepgramStreaming(config) ? "Deepgram streaming" : config.backend;
1518
- cmdCtx.ui.notify(`Voice enabled (${mode}).\n Hold SPACE (empty editor) → release to transcribe\n Ctrl+Shift+V → toggle recording on/off\n Live transcription shown while speaking`, "info");
1573
+ cmdCtx.ui.notify([
1574
+ `Voice enabled (${mode}).`,
1575
+ "",
1576
+ " Hold SPACE (500ms) → release to transcribe",
1577
+ " Ctrl+Shift+V → toggle recording on/off",
1578
+ " Quick SPACE tap → types a space (no voice)",
1579
+ "",
1580
+ " Live transcription shown while speaking",
1581
+ ].join("\n"), "info");
1519
1582
  return;
1520
1583
  }
1521
1584
 
@@ -1530,9 +1593,14 @@ export default function (pi: ExtensionAPI) {
1530
1593
 
1531
1594
  if (sub === "stop") {
1532
1595
  if (voiceState === "recording") {
1533
- isHolding = false;
1534
1596
  await stopVoiceRecording("editor");
1535
1597
  cmdCtx.ui.notify("Recording stopped and transcribed.", "info");
1598
+ } else if (voiceState === "warmup") {
1599
+ clearHoldTimer();
1600
+ clearWarmupWidget();
1601
+ hideWidget();
1602
+ setVoiceState("idle");
1603
+ cmdCtx.ui.notify("Warmup cancelled.", "info");
1536
1604
  } else {
1537
1605
  cmdCtx.ui.notify("No recording in progress.", "info");
1538
1606
  }
@@ -1540,7 +1608,7 @@ export default function (pi: ExtensionAPI) {
1540
1608
  }
1541
1609
 
1542
1610
  if (sub === "test") {
1543
- cmdCtx.ui.notify("Testing voice setup...", "info");
1611
+ cmdCtx.ui.notify("Testing voice setup", "info");
1544
1612
  const diagnostics = scanEnvironment(TRANSCRIBE_SCRIPT);
1545
1613
  const dgKey = resolveDeepgramApiKey(config);
1546
1614
  const streaming = isDeepgramStreaming(config);
@@ -1557,11 +1625,15 @@ export default function (pi: ExtensionAPI) {
1557
1625
  ` model status: ${modelReadiness}`,
1558
1626
  ` language: ${config.language}`,
1559
1627
  ` streaming: ${streaming ? "YES (Deepgram WS)" : "NO (batch)"}`,
1560
- ` DEEPGRAM_API_KEY: ${dgKey ? "set (" + dgKey.slice(0, 8) + "...)" : "NOT SET"}`,
1628
+ ` DEEPGRAM_API_KEY: ${dgKey ? "set (" + dgKey.slice(0, 8) + ")" : "NOT SET"}`,
1561
1629
  ` onboarding: ${config.onboarding.completed ? "complete" : "incomplete"}`,
1562
1630
  ` python3: ${diagnostics.hasPython ? "OK" : "missing"}`,
1563
1631
  ` sox/rec: ${diagnostics.hasSox ? "OK" : "missing"}`,
1564
1632
  ` daemon: ${daemonUp ? "running" : "not running"}`,
1633
+ ` state: ${voiceState}`,
1634
+ ` hold threshold: ${HOLD_THRESHOLD_MS}ms`,
1635
+ ` release detect: ${RELEASE_DETECT_MS}ms`,
1636
+ ` kitty protocol: ${kittyReleaseDetected ? "detected" : "not detected"}`,
1565
1637
  ];
1566
1638
 
1567
1639
  if (diagnostics.hasSox) {
@@ -1615,14 +1687,15 @@ export default function (pi: ExtensionAPI) {
1615
1687
  ` setup: ${config.onboarding.completed ? `complete (${config.onboarding.source ?? "unknown"})` : "incomplete"}`,
1616
1688
  ` socket: ${activeSocketPath}`,
1617
1689
  ` daemon: ${daemonUp ? "running" : "stopped"}${daemonInfo}`,
1618
- ` hold-key: SPACE (editor empty) or Ctrl+Shift+V (toggle)`,
1690
+ ` hold-key: SPACE (hold ≥${HOLD_THRESHOLD_MS}ms) or Ctrl+Shift+V (toggle)`,
1619
1691
  ` btw-key: Ctrl+Shift+B (hold to record → auto-btw)`,
1692
+ ` kitty: ${kittyReleaseDetected ? "yes" : "no"}`,
1620
1693
  ].join("\n"), "info");
1621
1694
  return;
1622
1695
  }
1623
1696
 
1624
1697
  if (sub === "daemon" || sub === "daemon start") {
1625
- cmdCtx.ui.notify("Starting STT daemon...", "info");
1698
+ cmdCtx.ui.notify("Starting STT daemon", "info");
1626
1699
  const ok = await ensureDaemon(config);
1627
1700
  cmdCtx.ui.notify(ok ? "Daemon started." : "Failed to start daemon.", ok ? "info" : "error");
1628
1701
  return;
@@ -1864,7 +1937,7 @@ export default function (pi: ExtensionAPI) {
1864
1937
  return;
1865
1938
  }
1866
1939
 
1867
- cmdCtx.ui.notify("Summarizing BTW thread...", "info");
1940
+ cmdCtx.ui.notify("Summarizing BTW thread", "info");
1868
1941
 
1869
1942
  try {
1870
1943
  let summary = "";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@codexstar/pi-listen",
3
- "version": "1.0.16",
3
+ "version": "1.0.18",
4
4
  "description": "Voice input, first-run onboarding, and side-channel BTW conversations for Pi",
5
5
  "type": "module",
6
6
  "keywords": [