@phenx-inc/ctlsurf 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/out/headless/index.mjs +26 -10
  2. package/out/headless/index.mjs.map +2 -2
  3. package/out/main/index.js +31 -9
  4. package/out/preload/index.js +8 -0
  5. package/out/renderer/assets/{cssMode-DbMmcl1h.js → cssMode-BQN8v2ok.js} +3 -3
  6. package/out/renderer/assets/{freemarker2-CvaHiy92.js → freemarker2-DbxGYYVp.js} +1 -1
  7. package/out/renderer/assets/{handlebars-D58lUIOu.js → handlebars-3auU1CAd.js} +1 -1
  8. package/out/renderer/assets/{html-D1h1aJbM.js → html-D8xFiRmI.js} +1 -1
  9. package/out/renderer/assets/{htmlMode-BdkAp9qr.js → htmlMode-M3MApZ4n.js} +3 -3
  10. package/out/renderer/assets/{index-B60JU1yI.js → index---H6cxNl.js} +854 -38
  11. package/out/renderer/assets/{index-DJFYmHjz.css → index-B-iM7dFC.css} +269 -0
  12. package/out/renderer/assets/{javascript-CXqZcnvb.js → javascript-BO_ViZM5.js} +2 -2
  13. package/out/renderer/assets/{jsonMode-BuVr-eSl.js → jsonMode-CKp2zvZu.js} +3 -3
  14. package/out/renderer/assets/{liquid-LKu0Wd0B.js → liquid-C1eHcrht.js} +1 -1
  15. package/out/renderer/assets/{lspLanguageFeatures-Cjr_4HGs.js → lspLanguageFeatures-CHWJx_Tl.js} +1 -1
  16. package/out/renderer/assets/{mdx-Bl84ILla.js → mdx-Qqdtk7fL.js} +1 -1
  17. package/out/renderer/assets/{python-0sFd9G1k.js → python-DKu7rNbs.js} +1 -1
  18. package/out/renderer/assets/{razor-Cqcu1rLJ.js → razor-BOMpCo6z.js} +1 -1
  19. package/out/renderer/assets/{tsMode-CYd3NUkW.js → tsMode-yAjlPR-D.js} +1 -1
  20. package/out/renderer/assets/{typescript-rkc9lhpi.js → typescript-BiJRCUcL.js} +1 -1
  21. package/out/renderer/assets/{xml-EsHEUps1.js → xml-D4PvYeQq.js} +1 -1
  22. package/out/renderer/assets/{yaml-B9-nQ_s2.js → yaml-BeHVkmnS.js} +1 -1
  23. package/out/renderer/index.html +2 -2
  24. package/package.json +1 -1
  25. package/src/main/index.ts +7 -0
  26. package/src/main/orchestrator.ts +38 -9
  27. package/src/preload/index.ts +11 -0
  28. package/src/renderer/App.tsx +39 -6
  29. package/src/renderer/components/FloatingMic.tsx +128 -0
  30. package/src/renderer/components/SpeakControls.tsx +235 -0
  31. package/src/renderer/components/VoiceInput.tsx +170 -6
  32. package/src/renderer/lib/localWhisper.ts +48 -4
  33. package/src/renderer/lib/speech.ts +299 -0
  34. package/src/renderer/styles.css +269 -0
@@ -0,0 +1,128 @@
1
+ import { useCallback, useEffect, useRef, useState } from 'react'
2
+ import { VoiceInput } from './VoiceInput'
3
+
4
+ // A draggable, dismissable push-to-talk mic that floats over the panes. It wraps
5
+ // the same <VoiceInput> push-to-talk logic used in the titlebar; only the chrome
6
+ // (drag handle + hide button) and positioning live here.
7
+
8
+ const POS_KEY = 'ctlsurf.floatingMicPos'
9
+
10
+ interface Pos { x: number; y: number }
11
+
12
+ interface FloatingMicProps {
13
+ onTranscript: (text: string) => void
14
+ onHide: () => void
15
+ }
16
+
17
+ // Keep the button clear of the 38px titlebar and 24px status bar.
18
+ const EDGE = 20
19
+ const TOP_MIN = 46
20
+ const BOTTOM_GAP = 36
21
+
22
/**
 * Read the last persisted floating-mic position from localStorage.
 *
 * @returns The stored `{ x, y }` when it is an object with two finite numeric
 *   coordinates; `null` when nothing is stored, storage is unavailable, or the
 *   stored value is malformed.
 */
function loadPos(): Pos | null {
  try {
    const raw = localStorage.getItem(POS_KEY)
    if (!raw) return null
    const parsed: unknown = JSON.parse(raw)
    // JSON such as `null`, `5` or `"x"` is valid but is not a position.
    if (typeof parsed !== 'object' || parsed === null) return null
    const { x, y } = parsed as Partial<Record<keyof Pos, unknown>>
    // Number.isFinite also rejects Infinity, which JSON.parse yields for
    // out-of-range literals like `1e999`.
    if (typeof x === 'number' && Number.isFinite(x) && typeof y === 'number' && Number.isFinite(y)) {
      return { x, y }
    }
  } catch { /* storage unavailable or corrupt JSON: fall through to null */ }
  return null
}
32
+
33
/**
 * Draggable, dismissable push-to-talk button that floats over the panes.
 *
 * Wraps `<VoiceInput variant="floating">` and adds a drag handle plus a hide
 * button. The position is kept in state, clamped to the viewport, and written
 * to localStorage under `POS_KEY` when a drag ends.
 *
 * @param onTranscript Forwarded unchanged to `<VoiceInput>`.
 * @param onHide Invoked when the hide (×) button is clicked.
 */
export function FloatingMic({ onTranscript, onHide }: FloatingMicProps) {
  const [pos, setPos] = useState<Pos | null>(loadPos)
  const elRef = useRef<HTMLDivElement>(null)
  // Pointer-to-element offset captured at drag start; null when not dragging.
  const dragRef = useRef<{ dx: number; dy: number } | null>(null)
  // Mirror of the latest position so event handlers can read and persist it
  // without performing side effects inside a state updater.
  const posRef = useRef<Pos | null>(pos)

  // Keep the button fully inside the viewport (used on drag, mount, and resize).
  const clamp = useCallback((x: number, y: number): Pos => {
    const el = elRef.current
    const w = el?.offsetWidth ?? 64
    const h = el?.offsetHeight ?? 90
    return {
      x: Math.max(EDGE, Math.min(x, window.innerWidth - w - EDGE)),
      y: Math.max(TOP_MIN, Math.min(y, window.innerHeight - h - BOTTOM_GAP)),
    }
  }, [])

  // Single write path for the position: updates the ref mirror and the state,
  // skipping the state update when nothing actually moved.
  const place = useCallback((next: Pos) => {
    const cur = posRef.current
    if (cur && cur.x === next.x && cur.y === next.y) return
    posRef.current = next
    setPos(next)
  }, [])

  // On mount: re-clamp a restored position (the window may be smaller than
  // when it was saved), or default to bottom-right when nothing was saved.
  useEffect(() => {
    const saved = posRef.current
    if (saved) {
      place(clamp(saved.x, saved.y))
    } else {
      // Clamping the far corner of the viewport yields the bottom-right slot.
      place(clamp(window.innerWidth, window.innerHeight))
    }
  }, [clamp, place])

  // Keep it reachable if the window shrinks.
  useEffect(() => {
    const onResize = () => {
      const cur = posRef.current
      if (cur) place(clamp(cur.x, cur.y))
    }
    window.addEventListener('resize', onResize)
    return () => window.removeEventListener('resize', onResize)
  }, [clamp, place])

  const onHandleDown = useCallback((e: React.PointerEvent) => {
    const el = elRef.current
    if (!el) return
    e.preventDefault()
    const rect = el.getBoundingClientRect()
    dragRef.current = { dx: e.clientX - rect.left, dy: e.clientY - rect.top }
    // Capture so moves keep arriving even when the pointer leaves the handle.
    e.currentTarget.setPointerCapture?.(e.pointerId)
  }, [])

  const onHandleMove = useCallback((e: React.PointerEvent) => {
    const d = dragRef.current
    if (!d) return
    place(clamp(e.clientX - d.dx, e.clientY - d.dy))
  }, [clamp, place])

  // Ends a drag (pointer up or cancel) and persists the final position.
  const onHandleUp = useCallback((e: React.PointerEvent) => {
    if (!dragRef.current) return
    dragRef.current = null
    const handle = e.currentTarget
    // releasePointerCapture throws if the pointer is no longer captured/active
    // (possible on pointercancel), so only release what we still hold.
    if (handle.hasPointerCapture?.(e.pointerId)) handle.releasePointerCapture(e.pointerId)
    const finalPos = posRef.current
    if (finalPos) {
      try { localStorage.setItem(POS_KEY, JSON.stringify(finalPos)) } catch { /* ignore */ }
    }
  }, [])

  // Render off-screen+hidden until the first position is computed (no flash).
  const style: React.CSSProperties = pos
    ? { left: pos.x, top: pos.y }
    : { left: -9999, top: -9999, visibility: 'hidden' }

  return (
    <div ref={elRef} className="floating-mic" style={style}>
      <div
        className="floating-mic-handle"
        onPointerDown={onHandleDown}
        onPointerMove={onHandleMove}
        onPointerUp={onHandleUp}
        onPointerCancel={onHandleUp}
        title="Drag to move"
        aria-label="Drag floating mic"
      >
        <span className="floating-mic-grip" aria-hidden="true">⠿</span>
        <button
          type="button"
          className="floating-mic-hide"
          // Don't let a click on the hide button start a drag.
          onPointerDown={(e) => e.stopPropagation()}
          onClick={onHide}
          title="Hide floating mic"
          aria-label="Hide floating mic"
        >
          ×
        </button>
      </div>
      <VoiceInput variant="floating" onTranscript={onTranscript} />
    </div>
  )
}
@@ -0,0 +1,235 @@
1
+ import { useCallback, useEffect, useRef, useState } from 'react'
2
+ import {
3
+ speech,
4
+ getEngine,
5
+ setEngine as persistEngine,
6
+ getVoiceURI,
7
+ setVoiceURI as persistVoiceURI,
8
+ getRate,
9
+ setRate as persistRate,
10
+ listWebVoices,
11
+ type TtsEngineId,
12
+ type TtsModelProgress,
13
+ } from '../lib/speech'
14
+
15
+ // Titlebar control for "speak agent replies" (Electron desktop only). A speaker
16
+ // toggle drives the main-process speakReplies setting (which runs the transcript
17
+ // tailer); a ▾ dropdown picks the engine, voice, and rate.
18
+
19
+ const SAMPLE = 'This is how spoken agent replies will sound.'
20
+
21
/**
 * Titlebar control for "speak agent replies".
 *
 * Renders a speaker toggle bound to the `window.worker` speakReplies setting,
 * a stop button while speech is active, and a dropdown for choosing the TTS
 * engine, the Web Speech voice, and the speaking rate. Engine, voice and rate
 * are persisted through the `../lib/speech` setters.
 */
export function SpeakControls() {
  const [enabled, setEnabled] = useState(false)
  const [engine, setEngineState] = useState<TtsEngineId>(getEngine)
  const [voices, setVoices] = useState<SpeechSynthesisVoice[]>([])
  const [voiceURI, setVoiceURIState] = useState<string | null>(getVoiceURI)
  const [rate, setRateState] = useState<number>(getRate)
  const [showMenu, setShowMenu] = useState(false)
  const [modelPct, setModelPct] = useState<number | null>(null)
  const [speaking, setSpeaking] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const wrapRef = useRef<HTMLDivElement>(null)

  // Reflect the persisted main-process setting on mount. The `alive` flag
  // drops a late response that arrives after unmount.
  useEffect(() => {
    let alive = true
    window.worker.getSpeakReplies().then((r) => { if (alive) setEnabled(!!r.enabled) }).catch(() => {})
    return () => { alive = false }
  }, [])

  // Speak replies as they arrive. Main only forwards when the setting is on, so
  // this is a no-op while disabled.
  useEffect(() => {
    const off = window.worker.onAgentMessage((text) => speech.enqueue(text))
    return off
  }, [])

  // Surface neural-model download progress, speaking activity and errors from
  // the speech module; the hooks are detached again on unmount.
  useEffect(() => {
    speech.onModelProgress = (p: TtsModelProgress | null) => {
      if (p && p.status === 'progress' && typeof p.progress === 'number') {
        setModelPct(Math.min(100, Math.round(p.progress)))
      } else if (!p) {
        // A null report clears the percentage badge.
        setModelPct(null)
      }
    }
    speech.onActivityChange = (a) => setSpeaking(a)
    speech.onError = (msg) => setError(msg)
    return () => {
      speech.onModelProgress = null
      speech.onActivityChange = null
      speech.onError = null
    }
  }, [])

  // Auto-dismiss the error chip.
  useEffect(() => {
    if (!error) return undefined
    const t = setTimeout(() => setError(null), 4000)
    return () => clearTimeout(t)
  }, [error])

  // Web Speech voices populate asynchronously.
  useEffect(() => {
    const load = () => setVoices(listWebVoices())
    load()
    if (typeof speechSynthesis === 'undefined') return undefined
    speechSynthesis.addEventListener('voiceschanged', load)
    return () => speechSynthesis.removeEventListener('voiceschanged', load)
  }, [])

  // Close the menu on an outside click.
  useEffect(() => {
    if (!showMenu) return undefined
    const onDown = (e: MouseEvent) => {
      if (!wrapRef.current?.contains(e.target as Node)) setShowMenu(false)
    }
    document.addEventListener('mousedown', onDown)
    return () => document.removeEventListener('mousedown', onDown)
  }, [showMenu])

  // Flip the setting optimistically, then roll back if the main process
  // rejects the change so the button never disagrees with the real setting.
  const toggle = useCallback(async () => {
    const next = !enabled
    setEnabled(next)
    // Prime the audio context from this click so neural playback isn't blocked
    // by autoplay policy when a reply later arrives without a gesture.
    if (next) speech.unlock()
    else speech.stop()
    try {
      await window.worker.setSpeakReplies(next)
    } catch (err) {
      setEnabled(!next)
      setError('Could not update the spoken-replies setting')
      console.error('[speak] setSpeakReplies failed', err)
    }
  }, [enabled])

  const chooseEngine = useCallback((id: TtsEngineId) => {
    persistEngine(id)
    setEngineState(id)
    speech.stop()
    // Preload the neural model so the first reply isn't slowed by download.
    if (id === 'neural') speech.warmup()
  }, [])

  // An empty string from the <select> means "System default" and is stored as null.
  const chooseVoice = useCallback((uri: string) => {
    const v = uri || null
    persistVoiceURI(v)
    setVoiceURIState(v)
  }, [])

  const changeRate = useCallback((r: number) => {
    persistRate(r)
    setRateState(r)
  }, [])

  // Interrupt whatever is playing and speak the sample sentence.
  const testVoice = useCallback(() => {
    speech.unlock()
    speech.stop()
    speech.enqueue(SAMPLE)
  }, [])

  const title = enabled ? 'Spoken replies on — click to mute' : 'Speak agent replies (off)'

  return (
    <div className="speak-controls" ref={wrapRef}>
      <button
        type="button"
        className={`titlebar-btn titlebar-icon-btn speak-btn ${enabled ? 'active' : ''}`}
        onClick={toggle}
        title={title}
        aria-label="Toggle spoken agent replies"
        aria-pressed={enabled}
      >
        <svg viewBox="0 0 24 24" width="13" height="13" fill="none" stroke="currentColor"
          strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" aria-hidden="true">
          <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
          {enabled ? (
            <>
              <path d="M15.54 8.46a5 5 0 0 1 0 7.07" />
              <path d="M19.07 4.93a10 10 0 0 1 0 14.14" />
            </>
          ) : (
            <line x1="23" y1="9" x2="17" y2="15" />
          )}
        </svg>
        {modelPct !== null && <span className="speak-pct">{modelPct}%</span>}
      </button>
      {speaking && (
        <button
          type="button"
          className="titlebar-btn titlebar-icon-btn speak-stop"
          onClick={() => speech.stop()}
          title="Stop speaking"
          aria-label="Stop speaking"
        >
          <svg viewBox="0 0 24 24" width="11" height="11" fill="currentColor" aria-hidden="true">
            <rect x="5" y="5" width="14" height="14" rx="2" />
          </svg>
        </button>
      )}
      <button
        type="button"
        className="titlebar-btn speak-caret"
        onClick={() => setShowMenu((v) => !v)}
        title="Voice options"
        aria-label="Voice options"
        aria-expanded={showMenu}
      >
        {/* NOTE(review): no visible child here — presumably the caret glyph comes from CSS; confirm */}
      </button>

      {showMenu && (
        <div className="speak-menu" role="menu">
          <div className="speak-menu-head">Engine</div>
          <button
            type="button"
            className={`speak-menu-item ${engine === 'web' ? 'active' : ''}`}
            onClick={() => chooseEngine('web')}
          >
            <span className="speak-menu-check">{engine === 'web' ? '✓' : ''}</span>
            <span>System voice (instant)</span>
          </button>
          <button
            type="button"
            className={`speak-menu-item ${engine === 'neural' ? 'active' : ''}`}
            onClick={() => chooseEngine('neural')}
          >
            <span className="speak-menu-check">{engine === 'neural' ? '✓' : ''}</span>
            <span>Neural voice (downloads)</span>
          </button>

          {/* The voice list only applies to the Web Speech engine. */}
          {engine === 'web' && (
            <>
              <div className="speak-menu-head">Voice</div>
              <select
                className="speak-select"
                value={voiceURI || ''}
                onChange={(e) => chooseVoice(e.target.value)}
              >
                <option value="">System default</option>
                {voices.map((v) => (
                  <option key={v.voiceURI} value={v.voiceURI}>
                    {v.name} {v.lang ? `(${v.lang})` : ''}
                  </option>
                ))}
              </select>
            </>
          )}

          <div className="speak-menu-head">Rate · {rate.toFixed(1)}×</div>
          <input
            className="speak-rate"
            type="range"
            min={0.5}
            max={2}
            step={0.1}
            value={rate}
            onChange={(e) => changeRate(Number(e.target.value))}
          />

          <div className="speak-menu-row">
            <button type="button" className="speak-menu-btn" onClick={testVoice}>Test</button>
            <button type="button" className="speak-menu-btn" onClick={() => speech.stop()}>Stop</button>
          </div>
        </div>
      )}
      {error && <div className="speak-error">{error}</div>}
    </div>
  )
}
@@ -43,6 +43,12 @@ type Engine = 'web-speech' | 'local'
43
43
  type Phase = 'idle' | 'listening' | 'transcribing'
44
44
 
45
45
  const ENGINE_KEY = 'ctlsurf.voiceEngine'
46
+ // Persisted audioinput deviceId for the local engine; null/absent = OS default.
47
+ const DEVICE_KEY = 'ctlsurf.voiceDeviceId'
48
+
49
/**
 * Read the persisted audio-input device id from localStorage.
 *
 * @returns The stored id, or `null` when nothing (or an empty string) is
 *   stored or storage access throws.
 */
function loadDeviceId(): string | null {
  let stored: string | null
  try {
    stored = localStorage.getItem(DEVICE_KEY)
  } catch {
    return null
  }
  // Treat an empty string the same as "nothing saved".
  return stored ? stored : null
}
46
52
 
47
53
  const WEB_SPEECH_SUPPORTED = getRecognitionCtor() !== null
48
54
  const LOCAL_SUPPORTED =
@@ -76,9 +82,12 @@ function describeMicError(err: unknown): string {
76
82
  interface VoiceInputProps {
77
83
  // Called once per push-to-talk session with the final transcribed text.
78
84
  onTranscript: (text: string) => void
85
+ // 'titlebar' (default) renders the compact titlebar pill; 'floating' renders
86
+ // a round FAB used by the draggable on-canvas mic (see FloatingMic).
87
+ variant?: 'titlebar' | 'floating'
79
88
  }
80
89
 
81
- export function VoiceInput({ onTranscript }: VoiceInputProps) {
90
+ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputProps) {
82
91
  const [engine, setEngine] = useState<Engine>(loadInitialEngine)
83
92
  const [phase, setPhase] = useState<Phase>('idle')
84
93
  const [interim, setInterim] = useState('')
@@ -86,6 +95,14 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
86
95
  const [error, setError] = useState<string | null>(null)
87
96
  const [notice, setNotice] = useState<string | null>(null)
88
97
 
98
+ // Mic source selection (local engine only)
99
+ const [devices, setDevices] = useState<MediaDeviceInfo[]>([])
100
+ const [selectedDeviceId, setSelectedDeviceId] = useState<string | null>(loadDeviceId)
101
+ const [showDevicePicker, setShowDevicePicker] = useState(false)
102
+ const selectedDeviceIdRef = useRef(selectedDeviceId)
103
+ useEffect(() => { selectedDeviceIdRef.current = selectedDeviceId }, [selectedDeviceId])
104
+ const wrapRef = useRef<HTMLDivElement>(null)
105
+
89
106
  // Web Speech refs
90
107
  const recognitionRef = useRef<SpeechRecognitionLike | null>(null)
91
108
  const finalRef = useRef('')
@@ -125,6 +142,88 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
125
142
  streamRef.current = null
126
143
  }, [])
127
144
 
145
+ // ─── Mic source selection ──────────────────────────
146
+
147
+ const refreshDevices = useCallback(async () => {
148
+ if (!navigator.mediaDevices?.enumerateDevices) return
149
+ try {
150
+ const all = await navigator.mediaDevices.enumerateDevices()
151
+ setDevices(all.filter((d) => d.kind === 'audioinput'))
152
+ } catch { /* ignore */ }
153
+ }, [])
154
+
155
+ // Device labels are blank until mic permission is granted, so when the user
156
+ // opens the picker we request a one-shot permission to populate names.
157
+ const ensureDeviceLabels = useCallback(async () => {
158
+ if (!navigator.mediaDevices?.enumerateDevices) return
159
+ try {
160
+ const all = await navigator.mediaDevices.enumerateDevices()
161
+ const inputs = all.filter((d) => d.kind === 'audioinput')
162
+ if (inputs.length && inputs.every((d) => !d.label)) {
163
+ const s = await navigator.mediaDevices.getUserMedia({ audio: true })
164
+ s.getTracks().forEach((t) => t.stop())
165
+ }
166
+ } catch { /* permission denied → we'll show generic names */ }
167
+ await refreshDevices()
168
+ }, [refreshDevices])
169
+
170
+ const toggleDevicePicker = useCallback(() => {
171
+ setShowDevicePicker((open) => !open)
172
+ }, [])
173
+
174
+ // Populate device labels whenever the picker opens (needs mic permission).
175
+ useEffect(() => {
176
+ if (showDevicePicker) void ensureDeviceLabels()
177
+ }, [showDevicePicker, ensureDeviceLabels])
178
+
179
+ const chooseDevice = useCallback((id: string | null) => {
180
+ setSelectedDeviceId(id)
181
+ try {
182
+ if (id) localStorage.setItem(DEVICE_KEY, id)
183
+ else localStorage.removeItem(DEVICE_KEY)
184
+ } catch { /* ignore */ }
185
+ setShowDevicePicker(false)
186
+ }, [])
187
+
188
+ // Acquire a stream honoring the saved device, gracefully falling back to the
189
+ // OS default if that device was unplugged since it was chosen.
190
+ const getStream = useCallback(async (): Promise<MediaStream> => {
191
+ const id = selectedDeviceIdRef.current
192
+ try {
193
+ return await navigator.mediaDevices.getUserMedia({
194
+ audio: id ? { deviceId: { exact: id } } : true,
195
+ })
196
+ } catch (err) {
197
+ if (id && (err as { name?: string })?.name === 'OverconstrainedError') {
198
+ try { localStorage.removeItem(DEVICE_KEY) } catch { /* ignore */ }
199
+ setSelectedDeviceId(null)
200
+ setNotice('Saved microphone unavailable — using system default.')
201
+ return navigator.mediaDevices.getUserMedia({ audio: true })
202
+ }
203
+ throw err
204
+ }
205
+ }, [])
206
+
207
+ // Keep the list fresh when devices are plugged/unplugged.
208
+ useEffect(() => {
209
+ const md = navigator.mediaDevices
210
+ if (!md?.addEventListener) return
211
+ const onChange = () => { void refreshDevices() }
212
+ md.addEventListener('devicechange', onChange)
213
+ void refreshDevices()
214
+ return () => md.removeEventListener('devicechange', onChange)
215
+ }, [refreshDevices])
216
+
217
+ // Close the picker on an outside click.
218
+ useEffect(() => {
219
+ if (!showDevicePicker) return
220
+ const onDocDown = (e: MouseEvent) => {
221
+ if (!wrapRef.current?.contains(e.target as Node)) setShowDevicePicker(false)
222
+ }
223
+ document.addEventListener('mousedown', onDocDown)
224
+ return () => document.removeEventListener('mousedown', onDocDown)
225
+ }, [showDevicePicker])
226
+
128
227
  // ─── Web Speech engine ─────────────────────────────
129
228
 
130
229
  const startWebSpeech = useCallback(() => {
@@ -204,6 +303,7 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
204
303
  try {
205
304
  const text = await transcribeBlob(blob, handleModelProgress)
206
305
  if (text) onTranscriptRef.current(text)
306
+ else setNotice('No speech detected — check the mic source (▾).')
207
307
  } catch (err) {
208
308
  setError('On-device transcription failed')
209
309
  console.error('[voice] local transcription failed', err)
@@ -217,7 +317,7 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
217
317
  setError(null); setNotice(null); setInterim('')
218
318
  cancelGestureRef.current = false
219
319
  try {
220
- const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
320
+ const stream = await getStream()
221
321
  // Released during the permission/await — don't record anything.
222
322
  if (cancelGestureRef.current) {
223
323
  stream.getTracks().forEach((t) => t.stop())
@@ -238,7 +338,7 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
238
338
  setError(describeMicError(err))
239
339
  console.error('[voice] getUserMedia failed', err)
240
340
  }
241
- }, [runLocalTranscription, stopStream])
341
+ }, [runLocalTranscription, stopStream, getStream])
242
342
 
243
343
  const stopLocal = useCallback(() => {
244
344
  cancelGestureRef.current = true
@@ -291,11 +391,31 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
291
391
  else if (listening) chip = { kind: 'listening', text: interim || (engine === 'local' ? 'Recording…' : 'Listening…') }
292
392
  else if (busy) chip = { kind: 'busy', text: modelPct !== null ? `Downloading voice model… ${modelPct}%` : 'Transcribing…' }
293
393
 
394
+ const floating = variant === 'floating'
395
+ const btnClass = floating
396
+ ? `voice-btn voice-btn-floating ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
397
+ : `titlebar-btn titlebar-icon-btn voice-btn ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
398
+
399
+ // Hide Chromium's synthetic "default"/"communications" aggregate entries from
400
+ // the list; we expose our own "System default" item instead.
401
+ const realDevices = devices.filter(
402
+ (d) => d.deviceId && d.deviceId !== 'default' && d.deviceId !== 'communications',
403
+ )
404
+ const rawDefault = devices.find((d) => d.deviceId === 'default')?.label
405
+ const defaultLabel = rawDefault
406
+ ? `System default · ${rawDefault.replace(/^Default\s*-\s*/i, '')}`
407
+ : 'System default'
408
+ const activeLabel = selectedDeviceId
409
+ ? (realDevices.find((d) => d.deviceId === selectedDeviceId)?.label || 'Selected microphone')
410
+ : defaultLabel
411
+ // Source selection only affects the local (getUserMedia) engine.
412
+ const showSourcePicker = floating && LOCAL_SUPPORTED
413
+
294
414
  return (
295
- <div className="voice-input-wrap">
415
+ <div className="voice-input-wrap" ref={wrapRef}>
296
416
  <button
297
417
  type="button"
298
- className={`titlebar-btn titlebar-icon-btn voice-btn ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`}
418
+ className={btnClass}
299
419
  disabled={!ANY_SUPPORTED}
300
420
  onPointerDown={handlePointerDown}
301
421
  onPointerUp={handlePointerUp}
@@ -307,7 +427,51 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
307
427
  <span className="voice-icon" aria-hidden="true">🎤</span>
308
428
  <span className={`voice-dot ${listening ? 'on' : busy ? 'busy' : 'off'}`} />
309
429
  </button>
310
- {chip && <div className={`voice-chip ${chip.kind}`}>{chip.text}</div>}
430
+ {showSourcePicker && (
431
+ <button
432
+ type="button"
433
+ className="voice-source-btn"
434
+ onPointerDown={(e) => e.stopPropagation()}
435
+ onClick={toggleDevicePicker}
436
+ title={`Mic source: ${activeLabel}`}
437
+ aria-label="Choose microphone source"
438
+ aria-expanded={showDevicePicker}
439
+ >
440
+
441
+ </button>
442
+ )}
443
+ {showSourcePicker && showDevicePicker && (
444
+ <div className="voice-source-menu" role="menu">
445
+ <div className="voice-source-head">Microphone source</div>
446
+ <button
447
+ type="button"
448
+ role="menuitemradio"
449
+ aria-checked={selectedDeviceId === null}
450
+ className={`voice-source-item ${selectedDeviceId === null ? 'active' : ''}`}
451
+ onClick={() => chooseDevice(null)}
452
+ >
453
+ <span className="voice-source-check">{selectedDeviceId === null ? '✓' : ''}</span>
454
+ <span className="voice-source-label">{defaultLabel}</span>
455
+ </button>
456
+ {realDevices.map((d, i) => (
457
+ <button
458
+ key={d.deviceId}
459
+ type="button"
460
+ role="menuitemradio"
461
+ aria-checked={selectedDeviceId === d.deviceId}
462
+ className={`voice-source-item ${selectedDeviceId === d.deviceId ? 'active' : ''}`}
463
+ onClick={() => chooseDevice(d.deviceId)}
464
+ >
465
+ <span className="voice-source-check">{selectedDeviceId === d.deviceId ? '✓' : ''}</span>
466
+ <span className="voice-source-label">{d.label || `Microphone ${i + 1}`}</span>
467
+ </button>
468
+ ))}
469
+ {realDevices.length === 0 && (
470
+ <div className="voice-source-empty">No microphones found</div>
471
+ )}
472
+ </div>
473
+ )}
474
+ {chip && <div className={`voice-chip ${chip.kind} ${floating ? 'voice-chip-floating' : ''}`}>{chip.text}</div>}
311
475
  </div>
312
476
  )
313
477
  }
@@ -6,6 +6,37 @@
6
6
  const MODEL = 'Xenova/whisper-base'
7
7
  const TARGET_SAMPLE_RATE = 16000
8
8
 
9
+ // Whisper hallucinates filler tokens ("you", "Thank you.", "Thanks for
10
+ // watching.") when fed silence. Two guards below tame that:
11
+ // - Below SILENCE_RMS we treat the clip as silent and skip the model entirely.
12
+ // - Between SILENCE_RMS and LOW_CONFIDENCE_RMS we still run the model but drop
13
+ // the result if it's *only* a known filler phrase (likely a hallucination on
14
+ // a quiet clip rather than real speech).
15
+ const SILENCE_RMS = 0.008
16
+ const LOW_CONFIDENCE_RMS = 0.02
17
+ const FILLER_PHRASES = new Set([
18
+ 'you',
19
+ 'thank you',
20
+ 'thank you very much',
21
+ 'thank you for watching',
22
+ 'thanks for watching',
23
+ 'please subscribe',
24
+ ])
25
+
26
/**
 * Root-mean-square amplitude of a PCM clip, used as a cheap loudness proxy.
 *
 * @param pcm Audio samples.
 * @returns `sqrt(mean(sample²))`, or 0 for an empty clip.
 */
function computeRms(pcm: Float32Array): number {
  const sampleCount = pcm.length
  if (sampleCount === 0) return 0
  let energy = 0
  for (const sample of pcm) {
    energy += sample * sample
  }
  return Math.sqrt(energy / sampleCount)
}
34
+
35
/**
 * Report whether a transcript consists solely of one known filler phrase.
 *
 * The text is lower-cased, stripped of the punctuation in the character class
 * below, and whitespace-collapsed before the lookup in `FILLER_PHRASES`.
 *
 * @param text Raw transcript text.
 * @returns `true` when the normalized text is exactly a filler phrase.
 */
function isFillerOnly(text: string): boolean {
  const withoutPunctuation = text.toLowerCase().replace(/[.!?,…"']/g, '')
  const normalized = withoutPunctuation.replace(/\s+/g, ' ').trim()
  return FILLER_PHRASES.has(normalized)
}
39
+
9
40
  export interface ModelProgress {
10
41
  status: string
11
42
  file?: string
@@ -77,12 +108,25 @@ async function blobToPcm16k(blob: Blob): Promise<Float32Array | null> {
77
108
  }
78
109
 
79
110
  export async function transcribeBlob(blob: Blob, onProgress?: (p: ModelProgress) => void): Promise<string> {
80
- const transcriber = await loadTranscriber(onProgress)
111
+ // Decode first so a silent clip never triggers the (one-time, ~75MB) model
112
+ // download just to be thrown away below.
81
113
  const pcm = await blobToPcm16k(blob)
82
114
  if (!pcm) return ''
115
+ const rms = computeRms(pcm)
116
+ if (rms < SILENCE_RMS) {
117
+ console.info(`[voice] near-silent clip (rms=${rms.toFixed(4)}); skipping transcription`)
118
+ return ''
119
+ }
120
+
121
+ const transcriber = await loadTranscriber(onProgress)
83
122
  const result = await transcriber(pcm)
84
- const text = Array.isArray(result)
123
+ const text = (Array.isArray(result)
85
124
  ? result.map((r) => r.text).join(' ')
86
- : result?.text
87
- return (text || '').trim()
125
+ : result?.text || '').trim()
126
+
127
+ if (text && rms < LOW_CONFIDENCE_RMS && isFillerOnly(text)) {
128
+ console.info(`[voice] dropping filler-only output "${text}" from quiet clip (rms=${rms.toFixed(4)})`)
129
+ return ''
130
+ }
131
+ return text
88
132
  }