@codexstar/pi-listen 1.0.13 → 1.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/extensions/voice.ts +318 -70
  2. package/package.json +1 -1
@@ -566,6 +566,8 @@ export default function (pi: ExtensionAPI) {
566
566
 
567
567
  function voiceCleanup() {
568
568
  if (statusTimer) { clearInterval(statusTimer); statusTimer = null; }
569
+ clearHoldTimer();
570
+ stopRecordingWidgetAnimation();
569
571
  if (activeSession) {
570
572
  finalizeSession(activeSession);
571
573
  activeSession = null;
@@ -573,6 +575,8 @@ export default function (pi: ExtensionAPI) {
573
575
  if (legacyRecProcess) { legacyRecProcess.kill("SIGTERM"); legacyRecProcess = null; }
574
576
  if (tempFile) { try { fs.unlinkSync(tempFile); } catch {} tempFile = null; }
575
577
  isHolding = false;
578
+ spaceConsumed = false;
579
+ spaceDownTime = null;
576
580
  setVoiceState("idle");
577
581
  }
578
582
 
@@ -604,46 +608,211 @@ export default function (pi: ExtensionAPI) {
604
608
  ].join("\n"), validated ? "info" : "warning");
605
609
  }
606
610
 
607
- // ─── Live Transcript Widget ──────────────────────────────────────────────
611
+ // ─── Live Transcript Widget (Component-based, themed) ───────────────────
608
612
 
613
+ /** Subtle hint shown during the hold threshold wait */
614
+ function showHoldHintWidget() {
615
+ if (!ctx?.hasUI) return;
616
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
617
+ return {
618
+ invalidate() {},
619
+ render(width: number): string[] {
620
+ const bar = theme.fg("muted", "─".repeat(Math.min(width - 2, 60)));
621
+ return [
622
+ bar,
623
+ theme.fg("dim", " Hold " + theme.bold("SPACE") + " for voice input..."),
624
+ bar,
625
+ ];
626
+ },
627
+ };
628
+ }, { placement: "aboveEditor" });
629
+ }
630
+
631
+ function hideHoldHintWidget() {
632
+ if (!ctx?.hasUI) return;
633
+ ctx.ui.setWidget("voice-recording", undefined);
634
+ }
635
+
636
+ /** Animated recording indicator with live waveform */
637
+ function showRecordingWidget(target: "editor" | "btw") {
638
+ if (!ctx?.hasUI) return;
639
+ let frame = 0;
640
+ const waveChars = ["▁", "▂", "▃", "▅", "▆", "▇", "▆", "▅", "▃", "▂"];
641
+
642
+ // Animate the widget every 200ms
643
+ const animTimer = setInterval(() => {
644
+ frame++;
645
+ if (ctx?.hasUI) ctx.ui.setWidget("voice-recording", undefined); // force re-render
646
+ showRecordingWidgetFrame(target, frame, waveChars);
647
+ }, 200);
648
+
649
+ // Store the timer so we can clean it up
650
+ (showRecordingWidget as any)._animTimer = animTimer;
651
+
652
+ showRecordingWidgetFrame(target, frame, waveChars);
653
+ }
654
+
655
+ function showRecordingWidgetFrame(target: "editor" | "btw", frame: number, waveChars: string[]) {
656
+ if (!ctx?.hasUI) return;
657
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
658
+ return {
659
+ invalidate() {},
660
+ render(width: number): string[] {
661
+ const maxW = Math.min(width - 2, 72);
662
+ const elapsed = Math.round((Date.now() - recordingStart) / 1000);
663
+ const mins = Math.floor(elapsed / 60);
664
+ const secs = elapsed % 60;
665
+ const timeStr = mins > 0 ? `${mins}:${String(secs).padStart(2, "0")}` : `${secs}s`;
666
+
667
+ // Animated waveform
668
+ const waveLen = 12;
669
+ let wave = "";
670
+ for (let i = 0; i < waveLen; i++) {
671
+ wave += waveChars[(frame + i) % waveChars.length];
672
+ }
673
+
674
+ const topBorder = theme.fg("borderAccent", "╭" + "─".repeat(maxW) + "╮");
675
+ const botBorder = theme.fg("borderAccent", "╰" + "─".repeat(maxW) + "╯");
676
+ const pad = (s: string, w: number) => {
677
+ const visible = s.replace(/\x1b\[[^m]*m/g, "").length;
678
+ return s + " ".repeat(Math.max(0, w - visible));
679
+ };
680
+
681
+ const dot = theme.fg("error", "●");
682
+ const label = target === "btw"
683
+ ? theme.bold(theme.fg("accent", " BTW "))
684
+ : theme.bold(theme.fg("accent", " VOICE "));
685
+ const waveStyled = theme.fg("accent", wave);
686
+ const timeStyled = theme.fg("muted", timeStr);
687
+
688
+ const titleLine = ` ${dot} ${label} ${waveStyled} ${timeStyled}`;
689
+
690
+ const hint = target === "btw"
691
+ ? theme.fg("dim", " Press Ctrl+Shift+B to stop")
692
+ : kittyReleaseDetected
693
+ ? theme.fg("dim", " Release SPACE to finalize")
694
+ : theme.fg("dim", " Press SPACE again to stop");
695
+
696
+ const lines = [
697
+ topBorder,
698
+ theme.fg("borderAccent", "│") + pad(titleLine, maxW) + theme.fg("borderAccent", "│"),
699
+ theme.fg("borderAccent", "│") + pad(hint, maxW) + theme.fg("borderAccent", "│"),
700
+ botBorder,
701
+ ];
702
+ return lines;
703
+ },
704
+ };
705
+ }, { placement: "aboveEditor" });
706
+ }
707
+
708
+ function stopRecordingWidgetAnimation() {
709
+ const timer = (showRecordingWidget as any)?._animTimer;
710
+ if (timer) {
711
+ clearInterval(timer);
712
+ (showRecordingWidget as any)._animTimer = null;
713
+ }
714
+ }
715
+
716
+ /** Show live transcript inside a themed box */
609
717
  function updateLiveTranscriptWidget(interim: string, finals: string[]) {
610
718
  if (!ctx?.hasUI) return;
611
719
 
612
720
  const finalized = finals.join(" ");
613
721
  const displayText = finalized + (interim ? (finalized ? " " : "") + interim : "");
614
722
 
615
- if (!displayText.trim()) {
616
- ctx.ui.setWidget("voice-recording", [
617
- " 🎙 Listening... (speak now)",
618
- ], { placement: "aboveEditor" });
619
- return;
620
- }
723
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
724
+ return {
725
+ invalidate() {},
726
+ render(width: number): string[] {
727
+ const maxW = Math.min(width - 2, 72);
728
+ const elapsed = Math.round((Date.now() - recordingStart) / 1000);
729
+ const mins = Math.floor(elapsed / 60);
730
+ const secs = elapsed % 60;
731
+ const timeStr = mins > 0 ? `${mins}:${String(secs).padStart(2, "0")}` : `${secs}s`;
732
+
733
+ const topBorder = theme.fg("borderAccent", "╭" + "─".repeat(maxW) + "╮");
734
+ const botBorder = theme.fg("borderAccent", "╰" + "─".repeat(maxW) + "╯");
735
+ const sep = theme.fg("borderAccent", "│") + theme.fg("borderAccent", "─".repeat(maxW)) + theme.fg("borderAccent", "│");
736
+ const side = (content: string) => {
737
+ const stripped = content.replace(/\x1b\[[^m]*m/g, "");
738
+ const padding = Math.max(0, maxW - stripped.length);
739
+ return theme.fg("borderAccent", "│") + content + " ".repeat(padding) + theme.fg("borderAccent", "│");
740
+ };
741
+
742
+ const dot = theme.fg("error", "●");
743
+ const label = theme.bold(theme.fg("accent", " VOICE "));
744
+ const timeStyled = theme.fg("muted", timeStr);
745
+ const titleLine = ` ${dot} ${label} ${timeStyled}`;
746
+ const hint = kittyReleaseDetected
747
+ ? theme.fg("dim", " Release SPACE to finalize")
748
+ : theme.fg("dim", " Press SPACE again to stop");
749
+
750
+ const lines = [topBorder, side(titleLine)];
751
+
752
+ if (!displayText.trim()) {
753
+ lines.push(side(theme.fg("dim", " Listening... speak now")));
754
+ } else {
755
+ lines.push(sep);
756
+ // Word-wrap the transcript text
757
+ const innerMax = maxW - 4; // padding inside box
758
+ const words = displayText.split(" ");
759
+ const wrappedLines: string[] = [];
760
+ let currentLine = "";
761
+
762
+ for (const word of words) {
763
+ if ((currentLine + " " + word).trim().length > innerMax && currentLine) {
764
+ wrappedLines.push(currentLine);
765
+ currentLine = word;
766
+ } else {
767
+ currentLine = currentLine ? currentLine + " " + word : word;
768
+ }
769
+ }
770
+ if (currentLine) wrappedLines.push(currentLine);
771
+
772
+ // Show last 3 lines of transcript
773
+ const visible = wrappedLines.slice(-3);
774
+ for (let i = 0; i < visible.length; i++) {
775
+ let line = visible[i];
776
+ // Style: finalized parts in normal text, interim in accent
777
+ if (i === visible.length - 1 && interim) {
778
+ line = theme.fg("text", line) + theme.fg("accent", "▍");
779
+ } else {
780
+ line = theme.fg("text", line);
781
+ }
782
+ lines.push(side(" " + line));
783
+ }
784
+ }
621
785
 
622
- // Show the live transcript — last 3 lines max
623
- const words = displayText.split(" ");
624
- const lines: string[] = [];
625
- let currentLine = " 🎙 ";
626
- const maxLineLen = 70;
627
-
628
- for (const word of words) {
629
- if ((currentLine + word).length > maxLineLen) {
630
- lines.push(currentLine);
631
- currentLine = " " + word + " ";
632
- } else {
633
- currentLine += word + " ";
634
- }
635
- }
636
- if (currentLine.trim()) lines.push(currentLine);
637
-
638
- // Keep only last 4 lines to avoid widget overflow
639
- const visibleLines = lines.slice(-4);
640
- if (interim) {
641
- // Show a blinking cursor for interim text
642
- const lastIdx = visibleLines.length - 1;
643
- visibleLines[lastIdx] = visibleLines[lastIdx].trimEnd() + "▍";
644
- }
786
+ lines.push(side(hint));
787
+ lines.push(botBorder);
788
+ return lines;
789
+ },
790
+ };
791
+ }, { placement: "aboveEditor" });
792
+ }
645
793
 
646
- ctx.ui.setWidget("voice-recording", visibleLines, { placement: "aboveEditor" });
794
+ /** Transcribing state — show a processing indicator */
795
+ function showTranscribingWidget() {
796
+ if (!ctx?.hasUI) return;
797
+ ctx.ui.setWidget("voice-recording", (tui, theme) => {
798
+ return {
799
+ invalidate() {},
800
+ render(width: number): string[] {
801
+ const maxW = Math.min(width - 2, 72);
802
+ const topBorder = theme.fg("border", "╭" + "─".repeat(maxW) + "╮");
803
+ const botBorder = theme.fg("border", "╰" + "─".repeat(maxW) + "╯");
804
+ const side = (content: string) => {
805
+ const stripped = content.replace(/\x1b\[[^m]*m/g, "");
806
+ const padding = Math.max(0, maxW - stripped.length);
807
+ return theme.fg("border", "│") + content + " ".repeat(padding) + theme.fg("border", "│");
808
+ };
809
+ const spinner = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
810
+ const idx = Math.floor(Date.now() / 100) % spinner.length;
811
+ const line = ` ${theme.fg("accent", spinner[idx])} ${theme.fg("dim", "Finalizing transcription...")}`;
812
+ return [topBorder, side(line), botBorder];
813
+ },
814
+ };
815
+ }, { placement: "aboveEditor" });
647
816
  }
648
817
 
649
818
  // ─── Voice: Start / Stop (Streaming or Legacy) ───────────────────────────
@@ -665,6 +834,7 @@ export default function (pi: ExtensionAPI) {
665
834
  },
666
835
  onDone: (fullText) => {
667
836
  activeSession = null;
837
+ stopRecordingWidgetAnimation();
668
838
  ctx?.ui.setWidget("voice-recording", undefined);
669
839
 
670
840
  if (!fullText.trim()) {
@@ -690,6 +860,7 @@ export default function (pi: ExtensionAPI) {
690
860
  },
691
861
  onError: (err) => {
692
862
  activeSession = null;
863
+ stopRecordingWidgetAnimation();
693
864
  ctx?.ui.setWidget("voice-recording", undefined);
694
865
  ctx?.ui.notify(`STT error: ${err}`, "error");
695
866
  setVoiceState("idle");
@@ -715,11 +886,8 @@ export default function (pi: ExtensionAPI) {
715
886
  }
716
887
  }, 1000);
717
888
 
718
- if (ctx.hasUI) {
719
- ctx.ui.setWidget("voice-recording", [
720
- " 🎙 Listening... speak now — press SPACE again to stop",
721
- ], { placement: "aboveEditor" });
722
- }
889
+ // Show the themed recording widget
890
+ showRecordingWidget(target);
723
891
  return true;
724
892
 
725
893
  } else {
@@ -743,11 +911,8 @@ export default function (pi: ExtensionAPI) {
743
911
  }, 1000);
744
912
 
745
913
  if (ctx.hasUI) {
746
- ctx.ui.setWidget("voice-recording", [
747
- target === "btw"
748
- ? " 🎙 BTW Recording... Ctrl+Shift+V to stop"
749
- : " 🎙 Recording... Ctrl+Shift+V to stop (or release SPACE)",
750
- ], { placement: "aboveEditor" });
914
+ // Show themed recording widget for legacy path
915
+ showRecordingWidget(target);
751
916
  }
752
917
  return true;
753
918
  }
@@ -760,6 +925,8 @@ export default function (pi: ExtensionAPI) {
760
925
  if (activeSession) {
761
926
  // === STREAMING PATH === Stop the stream, finalize will call onDone
762
927
  setVoiceState("transcribing");
928
+ stopRecordingWidgetAnimation();
929
+ showTranscribingWidget();
763
930
  stopStreamingSession(activeSession);
764
931
  return;
765
932
  }
@@ -768,7 +935,8 @@ export default function (pi: ExtensionAPI) {
768
935
  const elapsed = ((Date.now() - recordingStart) / 1000).toFixed(1);
769
936
  const audioFile = tempFile;
770
937
  setVoiceState("transcribing");
771
- ctx.ui.setWidget("voice-recording", undefined);
938
+ stopRecordingWidgetAnimation();
939
+ showTranscribingWidget();
772
940
 
773
941
  await stopLegacyRecording();
774
942
 
@@ -822,20 +990,35 @@ export default function (pi: ExtensionAPI) {
822
990
  setVoiceState("idle");
823
991
  }
824
992
 
825
- // ─── Hold-to-talk / Toggle-to-talk ──────────────────────────────────────
993
+ // ─── Hold-to-talk with Duration Threshold ──────────────────────────────
826
994
  //
827
- // Kitty protocol terminals (Ghostty, WezTerm, Kitty) send key-release
828
- // events (":3u" sequences), enabling true hold-to-talk.
995
+ // SPACE activates voice ONLY when:
996
+ // 1. The editor is empty (no text typed yet)
997
+ // 2. SPACE is held for ≥ HOLD_THRESHOLD_MS (500ms)
829
998
  //
830
- // Non-Kitty terminals (Apple Terminal, iTerm2 without config, basic xterm)
831
- // only send key-press. We detect this and fall back to toggle:
832
- // 1st SPACE press → start recording
833
- // 2nd SPACE press → stop recording + transcribe
999
+ // If SPACE is released before the threshold, a regular space character
1000
+ // is typed into the editor (normal typing behavior).
834
1001
  //
835
- // We auto-detect Kitty support: if we see a key-release within the first
836
- // recording, we know hold-to-talk works. Otherwise, we stay in toggle mode.
837
-
838
- let kittyReleaseDetected = false; // have we ever seen a Kitty release event?
1002
+ // This prevents accidental voice activation when typing and matches
1003
+ // Claude Code's hold-to-talk UX pattern.
1004
+ //
1005
+ // For Kitty protocol terminals: hold → wait threshold → activate →
1006
+ // release → stop recording. True hold-to-talk.
1007
+ // For non-Kitty terminals: hold → wait threshold → activate →
1008
+ // press SPACE again → stop recording. Toggle after activation.
1009
+
1010
+ const HOLD_THRESHOLD_MS = 500; // minimum hold time before voice activates
1011
+ let kittyReleaseDetected = false;
1012
+ let spaceDownTime: number | null = null; // timestamp when SPACE was first pressed
1013
+ let holdActivationTimer: ReturnType<typeof setTimeout> | null = null;
1014
+ let spaceConsumed = false; // whether we've committed to voice (past threshold)
1015
+
1016
+ function clearHoldTimer() {
1017
+ if (holdActivationTimer) {
1018
+ clearTimeout(holdActivationTimer);
1019
+ holdActivationTimer = null;
1020
+ }
1021
+ }
839
1022
 
840
1023
  function setupHoldToTalk() {
841
1024
  if (!ctx?.hasUI) return;
@@ -847,54 +1030,119 @@ export default function (pi: ExtensionAPI) {
847
1030
 
848
1031
  // ── SPACE handling ──
849
1032
  if (matchesKey(data, "space")) {
1033
+ // RULE: If editor has content, SPACE always types a space — never voice
850
1034
  const editorText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
851
- if (editorText && editorText.trim().length > 0) return undefined;
1035
+ if (editorText && editorText.trim().length > 0) {
1036
+ clearHoldTimer();
1037
+ spaceDownTime = null;
1038
+ spaceConsumed = false;
1039
+ return undefined; // let the default space character through
1040
+ }
852
1041
 
853
- // Kitty key-release: stop recording
1042
+ // ── Kitty key-release ──
854
1043
  if (isKeyRelease(data)) {
855
1044
  kittyReleaseDetected = true;
856
- if (isHolding && voiceState === "recording") {
1045
+
1046
+ // Released before threshold → type a space character
1047
+ if (spaceDownTime && !spaceConsumed) {
1048
+ clearHoldTimer();
1049
+ spaceDownTime = null;
1050
+ spaceConsumed = false;
1051
+ // Insert a space into editor
1052
+ if (ctx?.hasUI) ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
1053
+ return { consume: true };
1054
+ }
1055
+
1056
+ // Released after threshold → stop recording (true hold-to-talk)
1057
+ if (spaceConsumed && isHolding && voiceState === "recording") {
857
1058
  isHolding = false;
1059
+ spaceConsumed = false;
1060
+ spaceDownTime = null;
858
1061
  stopVoiceRecording("editor");
859
1062
  return { consume: true };
860
1063
  }
1064
+
1065
+ spaceDownTime = null;
1066
+ spaceConsumed = false;
861
1067
  return undefined;
862
1068
  }
863
1069
 
864
- // Kitty key-repeat: suppress while holding
1070
+ // ── Kitty key-repeat: suppress while holding past threshold ──
865
1071
  if (isKeyRepeat(data)) {
866
- if (isHolding) return { consume: true };
1072
+ if (spaceConsumed || isHolding) return { consume: true };
867
1073
  return undefined;
868
1074
  }
869
1075
 
870
1076
  // === Key PRESS ===
871
1077
 
872
- // Currently recording? → this is the "stop" press (toggle mode)
873
- if (voiceState === "recording") {
1078
+ // If already recording (toggle mode for non-Kitty) → stop
1079
+ if (voiceState === "recording" && spaceConsumed) {
874
1080
  isHolding = false;
1081
+ spaceConsumed = false;
1082
+ spaceDownTime = null;
1083
+ clearHoldTimer();
875
1084
  stopVoiceRecording("editor");
876
1085
  return { consume: true };
877
1086
  }
878
1087
 
879
- // Currently transcribing? → ignore, wait for it to finish
1088
+ // If transcribing → ignore
880
1089
  if (voiceState === "transcribing") {
881
1090
  return { consume: true };
882
1091
  }
883
1092
 
884
- // Idle → start recording
885
- if (voiceState === "idle" && !isHolding) {
886
- isHolding = true;
887
- startVoiceRecording("editor").then((ok) => {
888
- if (!ok) isHolding = false;
889
- });
890
- return { consume: true };
1093
+ // Idle → start the hold timer
1094
+ if (voiceState === "idle" && !spaceDownTime) {
1095
+ spaceDownTime = Date.now();
1096
+ spaceConsumed = false;
1097
+
1098
+ // Show a subtle "preparing" indicator
1099
+ if (ctx?.hasUI) {
1100
+ showHoldHintWidget();
1101
+ }
1102
+
1103
+ // After threshold: activate voice recording
1104
+ holdActivationTimer = setTimeout(() => {
1105
+ holdActivationTimer = null;
1106
+ // Double-check: still idle, still holding, editor still empty
1107
+ const currentText = ctx?.hasUI ? ctx.ui.getEditorText() : "";
1108
+ if (voiceState === "idle" && spaceDownTime && !(currentText && currentText.trim().length > 0)) {
1109
+ spaceConsumed = true;
1110
+ isHolding = true;
1111
+ startVoiceRecording("editor").then((ok) => {
1112
+ if (!ok) {
1113
+ isHolding = false;
1114
+ spaceConsumed = false;
1115
+ spaceDownTime = null;
1116
+ }
1117
+ });
1118
+ } else {
1119
+ spaceDownTime = null;
1120
+ spaceConsumed = false;
1121
+ }
1122
+ }, HOLD_THRESHOLD_MS);
1123
+
1124
+ return { consume: true }; // consume now — we'll insert space on early release
891
1125
  }
892
1126
 
893
- if (isHolding) return { consume: true };
1127
+ if (isHolding || spaceConsumed) return { consume: true };
1128
+ return undefined;
1129
+ }
1130
+
1131
+ // ── Any other key while holding space (pre-threshold) → cancel hold, insert space ──
1132
+ if (spaceDownTime && !spaceConsumed && !matchesKey(data, "space")) {
1133
+ clearHoldTimer();
1134
+ // Insert the space that was consumed during hold detection
1135
+ if (ctx?.hasUI) {
1136
+ ctx.ui.setEditorText((ctx.ui.getEditorText() || "") + " ");
1137
+ hideHoldHintWidget();
1138
+ }
1139
+ spaceDownTime = null;
1140
+ spaceConsumed = false;
1141
+ // Don't consume this key — let it through
894
1142
  return undefined;
895
1143
  }
896
1144
 
897
- // ── Ctrl+Shift+B handling (BTW voice) ──
1145
+ // ── Ctrl+Shift+B handling (BTW voice) — direct toggle, no hold threshold ──
898
1146
  if (matchesKey(data, "ctrl+shift+b")) {
899
1147
  if (isKeyRelease(data)) {
900
1148
  kittyReleaseDetected = true;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@codexstar/pi-listen",
3
- "version": "1.0.13",
3
+ "version": "1.0.14",
4
4
  "description": "Voice input, first-run onboarding, and side-channel BTW conversations for Pi",
5
5
  "type": "module",
6
6
  "keywords": [