@phenx-inc/ctlsurf 0.7.0 → 0.8.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (33):
  1. package/out/headless/index.mjs +26 -10
  2. package/out/headless/index.mjs.map +2 -2
  3. package/out/main/index.js +31 -9
  4. package/out/preload/index.js +8 -0
  5. package/out/renderer/assets/{cssMode-eTXVdAkZ.js → cssMode-BQN8v2ok.js} +3 -3
  6. package/out/renderer/assets/{freemarker2-B5BKaiK4.js → freemarker2-DbxGYYVp.js} +1 -1
  7. package/out/renderer/assets/{handlebars-BIdLd2wU.js → handlebars-3auU1CAd.js} +1 -1
  8. package/out/renderer/assets/{html-BXL4cnLS.js → html-D8xFiRmI.js} +1 -1
  9. package/out/renderer/assets/{htmlMode-46N3XG2c.js → htmlMode-M3MApZ4n.js} +3 -3
  10. package/out/renderer/assets/{index-dRvutfbl.js → index---H6cxNl.js} +696 -33
  11. package/out/renderer/assets/{index-Cf-RsxoC.css → index-B-iM7dFC.css} +195 -0
  12. package/out/renderer/assets/{javascript-n_iZZzDX.js → javascript-BO_ViZM5.js} +2 -2
  13. package/out/renderer/assets/{jsonMode-DXDczSNu.js → jsonMode-CKp2zvZu.js} +3 -3
  14. package/out/renderer/assets/{liquid-B1QweUh7.js → liquid-C1eHcrht.js} +1 -1
  15. package/out/renderer/assets/{lspLanguageFeatures-DqzMqkRk.js → lspLanguageFeatures-CHWJx_Tl.js} +1 -1
  16. package/out/renderer/assets/{mdx-BCv8lm5e.js → mdx-Qqdtk7fL.js} +1 -1
  17. package/out/renderer/assets/{python-BLNzYwDv.js → python-DKu7rNbs.js} +1 -1
  18. package/out/renderer/assets/{razor-CvAww8bG.js → razor-BOMpCo6z.js} +1 -1
  19. package/out/renderer/assets/{tsMode-C7m6Kr5E.js → tsMode-yAjlPR-D.js} +1 -1
  20. package/out/renderer/assets/{typescript-DhPw4VVg.js → typescript-BiJRCUcL.js} +1 -1
  21. package/out/renderer/assets/{xml-B0WLFJ2U.js → xml-D4PvYeQq.js} +1 -1
  22. package/out/renderer/assets/{yaml-BWyn9Wd7.js → yaml-BeHVkmnS.js} +1 -1
  23. package/out/renderer/index.html +2 -2
  24. package/package.json +1 -1
  25. package/src/main/index.ts +7 -0
  26. package/src/main/orchestrator.ts +38 -9
  27. package/src/preload/index.ts +11 -0
  28. package/src/renderer/App.tsx +5 -0
  29. package/src/renderer/components/SpeakControls.tsx +235 -0
  30. package/src/renderer/components/VoiceInput.tsx +159 -3
  31. package/src/renderer/lib/localWhisper.ts +48 -4
  32. package/src/renderer/lib/speech.ts +299 -0
  33. package/src/renderer/styles.css +195 -0
@@ -0,0 +1,235 @@
1
+ import { useCallback, useEffect, useRef, useState } from 'react'
2
+ import {
3
+ speech,
4
+ getEngine,
5
+ setEngine as persistEngine,
6
+ getVoiceURI,
7
+ setVoiceURI as persistVoiceURI,
8
+ getRate,
9
+ setRate as persistRate,
10
+ listWebVoices,
11
+ type TtsEngineId,
12
+ type TtsModelProgress,
13
+ } from '../lib/speech'
14
+
15
+ // Titlebar control for "speak agent replies" (Electron desktop only). A speaker
16
+ // toggle drives the main-process speakReplies setting (which runs the transcript
17
+ // tailer); a ▾ dropdown picks the engine, voice, and rate.
18
+
19
+ const SAMPLE = 'This is how spoken agent replies will sound.'
20
+
21
+ export function SpeakControls() {
22
+ const [enabled, setEnabled] = useState(false)
23
+ const [engine, setEngineState] = useState<TtsEngineId>(getEngine)
24
+ const [voices, setVoices] = useState<SpeechSynthesisVoice[]>([])
25
+ const [voiceURI, setVoiceURIState] = useState<string | null>(getVoiceURI)
26
+ const [rate, setRateState] = useState<number>(getRate)
27
+ const [showMenu, setShowMenu] = useState(false)
28
+ const [modelPct, setModelPct] = useState<number | null>(null)
29
+ const [speaking, setSpeaking] = useState(false)
30
+ const [error, setError] = useState<string | null>(null)
31
+ const wrapRef = useRef<HTMLDivElement>(null)
32
+
33
+ // Reflect the persisted main-process setting on mount.
34
+ useEffect(() => {
35
+ let alive = true
36
+ window.worker.getSpeakReplies().then((r) => { if (alive) setEnabled(!!r.enabled) }).catch(() => {})
37
+ return () => { alive = false }
38
+ }, [])
39
+
40
+ // Speak replies as they arrive. Main only forwards when the setting is on, so
41
+ // this is a no-op while disabled.
42
+ useEffect(() => {
43
+ const off = window.worker.onAgentMessage((text) => speech.enqueue(text))
44
+ return off
45
+ }, [])
46
+
47
+ // Surface neural-model download/transcribe progress on the button.
48
+ useEffect(() => {
49
+ speech.onModelProgress = (p: TtsModelProgress | null) => {
50
+ if (p && p.status === 'progress' && typeof p.progress === 'number') {
51
+ setModelPct(Math.min(100, Math.round(p.progress)))
52
+ } else if (!p) {
53
+ setModelPct(null)
54
+ }
55
+ }
56
+ speech.onActivityChange = (a) => setSpeaking(a)
57
+ speech.onError = (msg) => setError(msg)
58
+ return () => {
59
+ speech.onModelProgress = null
60
+ speech.onActivityChange = null
61
+ speech.onError = null
62
+ }
63
+ }, [])
64
+
65
+ // Auto-dismiss the error chip.
66
+ useEffect(() => {
67
+ if (!error) return
68
+ const t = setTimeout(() => setError(null), 4000)
69
+ return () => clearTimeout(t)
70
+ }, [error])
71
+
72
+ // Web Speech voices populate asynchronously.
73
+ useEffect(() => {
74
+ const load = () => setVoices(listWebVoices())
75
+ load()
76
+ if (typeof speechSynthesis !== 'undefined') {
77
+ speechSynthesis.addEventListener('voiceschanged', load)
78
+ return () => speechSynthesis.removeEventListener('voiceschanged', load)
79
+ }
80
+ }, [])
81
+
82
+ // Close the menu on an outside click.
83
+ useEffect(() => {
84
+ if (!showMenu) return
85
+ const onDown = (e: MouseEvent) => {
86
+ if (!wrapRef.current?.contains(e.target as Node)) setShowMenu(false)
87
+ }
88
+ document.addEventListener('mousedown', onDown)
89
+ return () => document.removeEventListener('mousedown', onDown)
90
+ }, [showMenu])
91
+
92
+ const toggle = useCallback(async () => {
93
+ const next = !enabled
94
+ setEnabled(next)
95
+ // Prime the audio context from this click so neural playback isn't blocked
96
+ // by autoplay policy when a reply later arrives without a gesture.
97
+ if (next) speech.unlock()
98
+ else speech.stop()
99
+ try { await window.worker.setSpeakReplies(next) } catch { /* ignore */ }
100
+ }, [enabled])
101
+
102
+ const chooseEngine = useCallback((id: TtsEngineId) => {
103
+ persistEngine(id)
104
+ setEngineState(id)
105
+ speech.stop()
106
+ // Preload the neural model so the first reply isn't slowed by download.
107
+ if (id === 'neural') speech.warmup()
108
+ }, [])
109
+
110
+ const chooseVoice = useCallback((uri: string) => {
111
+ const v = uri || null
112
+ persistVoiceURI(v)
113
+ setVoiceURIState(v)
114
+ }, [])
115
+
116
+ const changeRate = useCallback((r: number) => {
117
+ persistRate(r)
118
+ setRateState(r)
119
+ }, [])
120
+
121
+ const testVoice = useCallback(() => {
122
+ speech.unlock()
123
+ speech.stop()
124
+ speech.enqueue(SAMPLE)
125
+ }, [])
126
+
127
+ const title = enabled ? 'Spoken replies on — click to mute' : 'Speak agent replies (off)'
128
+
129
+ return (
130
+ <div className="speak-controls" ref={wrapRef}>
131
+ <button
132
+ type="button"
133
+ className={`titlebar-btn titlebar-icon-btn speak-btn ${enabled ? 'active' : ''}`}
134
+ onClick={toggle}
135
+ title={title}
136
+ aria-label="Toggle spoken agent replies"
137
+ aria-pressed={enabled}
138
+ >
139
+ <svg viewBox="0 0 24 24" width="13" height="13" fill="none" stroke="currentColor"
140
+ strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" aria-hidden="true">
141
+ <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
142
+ {enabled ? (
143
+ <>
144
+ <path d="M15.54 8.46a5 5 0 0 1 0 7.07" />
145
+ <path d="M19.07 4.93a10 10 0 0 1 0 14.14" />
146
+ </>
147
+ ) : (
148
+ <line x1="23" y1="9" x2="17" y2="15" />
149
+ )}
150
+ </svg>
151
+ {modelPct !== null && <span className="speak-pct">{modelPct}%</span>}
152
+ </button>
153
+ {speaking && (
154
+ <button
155
+ type="button"
156
+ className="titlebar-btn titlebar-icon-btn speak-stop"
157
+ onClick={() => speech.stop()}
158
+ title="Stop speaking"
159
+ aria-label="Stop speaking"
160
+ >
161
+ <svg viewBox="0 0 24 24" width="11" height="11" fill="currentColor" aria-hidden="true">
162
+ <rect x="5" y="5" width="14" height="14" rx="2" />
163
+ </svg>
164
+ </button>
165
+ )}
166
+ <button
167
+ type="button"
168
+ className="titlebar-btn speak-caret"
169
+ onClick={() => setShowMenu((v) => !v)}
170
+ title="Voice options"
171
+ aria-label="Voice options"
172
+ aria-expanded={showMenu}
173
+ >
174
+
175
+ </button>
176
+
177
+ {showMenu && (
178
+ <div className="speak-menu" role="menu">
179
+ <div className="speak-menu-head">Engine</div>
180
+ <button
181
+ type="button"
182
+ className={`speak-menu-item ${engine === 'web' ? 'active' : ''}`}
183
+ onClick={() => chooseEngine('web')}
184
+ >
185
+ <span className="speak-menu-check">{engine === 'web' ? '✓' : ''}</span>
186
+ <span>System voice (instant)</span>
187
+ </button>
188
+ <button
189
+ type="button"
190
+ className={`speak-menu-item ${engine === 'neural' ? 'active' : ''}`}
191
+ onClick={() => chooseEngine('neural')}
192
+ >
193
+ <span className="speak-menu-check">{engine === 'neural' ? '✓' : ''}</span>
194
+ <span>Neural voice (downloads)</span>
195
+ </button>
196
+
197
+ {engine === 'web' && (
198
+ <>
199
+ <div className="speak-menu-head">Voice</div>
200
+ <select
201
+ className="speak-select"
202
+ value={voiceURI || ''}
203
+ onChange={(e) => chooseVoice(e.target.value)}
204
+ >
205
+ <option value="">System default</option>
206
+ {voices.map((v) => (
207
+ <option key={v.voiceURI} value={v.voiceURI}>
208
+ {v.name} {v.lang ? `(${v.lang})` : ''}
209
+ </option>
210
+ ))}
211
+ </select>
212
+ </>
213
+ )}
214
+
215
+ <div className="speak-menu-head">Rate · {rate.toFixed(1)}×</div>
216
+ <input
217
+ className="speak-rate"
218
+ type="range"
219
+ min={0.5}
220
+ max={2}
221
+ step={0.1}
222
+ value={rate}
223
+ onChange={(e) => changeRate(Number(e.target.value))}
224
+ />
225
+
226
+ <div className="speak-menu-row">
227
+ <button type="button" className="speak-menu-btn" onClick={testVoice}>Test</button>
228
+ <button type="button" className="speak-menu-btn" onClick={() => speech.stop()}>Stop</button>
229
+ </div>
230
+ </div>
231
+ )}
232
+ {error && <div className="speak-error">{error}</div>}
233
+ </div>
234
+ )
235
+ }
@@ -43,6 +43,12 @@ type Engine = 'web-speech' | 'local'
43
43
  type Phase = 'idle' | 'listening' | 'transcribing'
44
44
 
45
45
  const ENGINE_KEY = 'ctlsurf.voiceEngine'
46
+ // Persisted audioinput deviceId for the local engine; null/absent = OS default.
47
+ const DEVICE_KEY = 'ctlsurf.voiceDeviceId'
48
+
49
+ function loadDeviceId(): string | null {
50
+ try { return localStorage.getItem(DEVICE_KEY) || null } catch { return null }
51
+ }
46
52
 
47
53
  const WEB_SPEECH_SUPPORTED = getRecognitionCtor() !== null
48
54
  const LOCAL_SUPPORTED =
@@ -89,6 +95,14 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
89
95
  const [error, setError] = useState<string | null>(null)
90
96
  const [notice, setNotice] = useState<string | null>(null)
91
97
 
98
+ // Mic source selection (local engine only)
99
+ const [devices, setDevices] = useState<MediaDeviceInfo[]>([])
100
+ const [selectedDeviceId, setSelectedDeviceId] = useState<string | null>(loadDeviceId)
101
+ const [showDevicePicker, setShowDevicePicker] = useState(false)
102
+ const selectedDeviceIdRef = useRef(selectedDeviceId)
103
+ useEffect(() => { selectedDeviceIdRef.current = selectedDeviceId }, [selectedDeviceId])
104
+ const wrapRef = useRef<HTMLDivElement>(null)
105
+
92
106
  // Web Speech refs
93
107
  const recognitionRef = useRef<SpeechRecognitionLike | null>(null)
94
108
  const finalRef = useRef('')
@@ -128,6 +142,88 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
128
142
  streamRef.current = null
129
143
  }, [])
130
144
 
145
+ // ─── Mic source selection ──────────────────────────
146
+
147
+ const refreshDevices = useCallback(async () => {
148
+ if (!navigator.mediaDevices?.enumerateDevices) return
149
+ try {
150
+ const all = await navigator.mediaDevices.enumerateDevices()
151
+ setDevices(all.filter((d) => d.kind === 'audioinput'))
152
+ } catch { /* ignore */ }
153
+ }, [])
154
+
155
+ // Device labels are blank until mic permission is granted, so when the user
156
+ // opens the picker we request a one-shot permission to populate names.
157
+ const ensureDeviceLabels = useCallback(async () => {
158
+ if (!navigator.mediaDevices?.enumerateDevices) return
159
+ try {
160
+ const all = await navigator.mediaDevices.enumerateDevices()
161
+ const inputs = all.filter((d) => d.kind === 'audioinput')
162
+ if (inputs.length && inputs.every((d) => !d.label)) {
163
+ const s = await navigator.mediaDevices.getUserMedia({ audio: true })
164
+ s.getTracks().forEach((t) => t.stop())
165
+ }
166
+ } catch { /* permission denied → we'll show generic names */ }
167
+ await refreshDevices()
168
+ }, [refreshDevices])
169
+
170
+ const toggleDevicePicker = useCallback(() => {
171
+ setShowDevicePicker((open) => !open)
172
+ }, [])
173
+
174
+ // Populate device labels whenever the picker opens (needs mic permission).
175
+ useEffect(() => {
176
+ if (showDevicePicker) void ensureDeviceLabels()
177
+ }, [showDevicePicker, ensureDeviceLabels])
178
+
179
+ const chooseDevice = useCallback((id: string | null) => {
180
+ setSelectedDeviceId(id)
181
+ try {
182
+ if (id) localStorage.setItem(DEVICE_KEY, id)
183
+ else localStorage.removeItem(DEVICE_KEY)
184
+ } catch { /* ignore */ }
185
+ setShowDevicePicker(false)
186
+ }, [])
187
+
188
+ // Acquire a stream honoring the saved device, gracefully falling back to the
189
+ // OS default if that device was unplugged since it was chosen.
190
+ const getStream = useCallback(async (): Promise<MediaStream> => {
191
+ const id = selectedDeviceIdRef.current
192
+ try {
193
+ return await navigator.mediaDevices.getUserMedia({
194
+ audio: id ? { deviceId: { exact: id } } : true,
195
+ })
196
+ } catch (err) {
197
+ if (id && (err as { name?: string })?.name === 'OverconstrainedError') {
198
+ try { localStorage.removeItem(DEVICE_KEY) } catch { /* ignore */ }
199
+ setSelectedDeviceId(null)
200
+ setNotice('Saved microphone unavailable — using system default.')
201
+ return navigator.mediaDevices.getUserMedia({ audio: true })
202
+ }
203
+ throw err
204
+ }
205
+ }, [])
206
+
207
+ // Keep the list fresh when devices are plugged/unplugged.
208
+ useEffect(() => {
209
+ const md = navigator.mediaDevices
210
+ if (!md?.addEventListener) return
211
+ const onChange = () => { void refreshDevices() }
212
+ md.addEventListener('devicechange', onChange)
213
+ void refreshDevices()
214
+ return () => md.removeEventListener('devicechange', onChange)
215
+ }, [refreshDevices])
216
+
217
+ // Close the picker on an outside click.
218
+ useEffect(() => {
219
+ if (!showDevicePicker) return
220
+ const onDocDown = (e: MouseEvent) => {
221
+ if (!wrapRef.current?.contains(e.target as Node)) setShowDevicePicker(false)
222
+ }
223
+ document.addEventListener('mousedown', onDocDown)
224
+ return () => document.removeEventListener('mousedown', onDocDown)
225
+ }, [showDevicePicker])
226
+
131
227
  // ─── Web Speech engine ─────────────────────────────
132
228
 
133
229
  const startWebSpeech = useCallback(() => {
@@ -207,6 +303,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
207
303
  try {
208
304
  const text = await transcribeBlob(blob, handleModelProgress)
209
305
  if (text) onTranscriptRef.current(text)
306
+ else setNotice('No speech detected — check the mic source (▾).')
210
307
  } catch (err) {
211
308
  setError('On-device transcription failed')
212
309
  console.error('[voice] local transcription failed', err)
@@ -220,7 +317,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
220
317
  setError(null); setNotice(null); setInterim('')
221
318
  cancelGestureRef.current = false
222
319
  try {
223
- const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
320
+ const stream = await getStream()
224
321
  // Released during the permission/await — don't record anything.
225
322
  if (cancelGestureRef.current) {
226
323
  stream.getTracks().forEach((t) => t.stop())
@@ -241,7 +338,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
241
338
  setError(describeMicError(err))
242
339
  console.error('[voice] getUserMedia failed', err)
243
340
  }
244
- }, [runLocalTranscription, stopStream])
341
+ }, [runLocalTranscription, stopStream, getStream])
245
342
 
246
343
  const stopLocal = useCallback(() => {
247
344
  cancelGestureRef.current = true
@@ -299,8 +396,23 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
299
396
  ? `voice-btn voice-btn-floating ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
300
397
  : `titlebar-btn titlebar-icon-btn voice-btn ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
301
398
 
399
+ // Hide Chromium's synthetic "default"/"communications" aggregate entries from
400
+ // the list; we expose our own "System default" item instead.
401
+ const realDevices = devices.filter(
402
+ (d) => d.deviceId && d.deviceId !== 'default' && d.deviceId !== 'communications',
403
+ )
404
+ const rawDefault = devices.find((d) => d.deviceId === 'default')?.label
405
+ const defaultLabel = rawDefault
406
+ ? `System default · ${rawDefault.replace(/^Default\s*-\s*/i, '')}`
407
+ : 'System default'
408
+ const activeLabel = selectedDeviceId
409
+ ? (realDevices.find((d) => d.deviceId === selectedDeviceId)?.label || 'Selected microphone')
410
+ : defaultLabel
411
+ // Source selection only affects the local (getUserMedia) engine.
412
+ const showSourcePicker = floating && LOCAL_SUPPORTED
413
+
302
414
  return (
303
- <div className="voice-input-wrap">
415
+ <div className="voice-input-wrap" ref={wrapRef}>
304
416
  <button
305
417
  type="button"
306
418
  className={btnClass}
@@ -315,6 +427,50 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
315
427
  <span className="voice-icon" aria-hidden="true">🎤</span>
316
428
  <span className={`voice-dot ${listening ? 'on' : busy ? 'busy' : 'off'}`} />
317
429
  </button>
430
+ {showSourcePicker && (
431
+ <button
432
+ type="button"
433
+ className="voice-source-btn"
434
+ onPointerDown={(e) => e.stopPropagation()}
435
+ onClick={toggleDevicePicker}
436
+ title={`Mic source: ${activeLabel}`}
437
+ aria-label="Choose microphone source"
438
+ aria-expanded={showDevicePicker}
439
+ >
440
+
441
+ </button>
442
+ )}
443
+ {showSourcePicker && showDevicePicker && (
444
+ <div className="voice-source-menu" role="menu">
445
+ <div className="voice-source-head">Microphone source</div>
446
+ <button
447
+ type="button"
448
+ role="menuitemradio"
449
+ aria-checked={selectedDeviceId === null}
450
+ className={`voice-source-item ${selectedDeviceId === null ? 'active' : ''}`}
451
+ onClick={() => chooseDevice(null)}
452
+ >
453
+ <span className="voice-source-check">{selectedDeviceId === null ? '✓' : ''}</span>
454
+ <span className="voice-source-label">{defaultLabel}</span>
455
+ </button>
456
+ {realDevices.map((d, i) => (
457
+ <button
458
+ key={d.deviceId}
459
+ type="button"
460
+ role="menuitemradio"
461
+ aria-checked={selectedDeviceId === d.deviceId}
462
+ className={`voice-source-item ${selectedDeviceId === d.deviceId ? 'active' : ''}`}
463
+ onClick={() => chooseDevice(d.deviceId)}
464
+ >
465
+ <span className="voice-source-check">{selectedDeviceId === d.deviceId ? '✓' : ''}</span>
466
+ <span className="voice-source-label">{d.label || `Microphone ${i + 1}`}</span>
467
+ </button>
468
+ ))}
469
+ {realDevices.length === 0 && (
470
+ <div className="voice-source-empty">No microphones found</div>
471
+ )}
472
+ </div>
473
+ )}
318
474
  {chip && <div className={`voice-chip ${chip.kind} ${floating ? 'voice-chip-floating' : ''}`}>{chip.text}</div>}
319
475
  </div>
320
476
  )
@@ -6,6 +6,37 @@
6
6
  const MODEL = 'Xenova/whisper-base'
7
7
  const TARGET_SAMPLE_RATE = 16000
8
8
 
9
+ // Whisper hallucinates filler tokens ("you", "Thank you.", "Thanks for
10
+ // watching.") when fed silence. Two guards below tame that:
11
+ // - Below SILENCE_RMS we treat the clip as silent and skip the model entirely.
12
+ // - Between SILENCE_RMS and LOW_CONFIDENCE_RMS we still run the model but drop
13
+ // the result if it's *only* a known filler phrase (likely a hallucination on
14
+ // a quiet clip rather than real speech).
15
+ const SILENCE_RMS = 0.008
16
+ const LOW_CONFIDENCE_RMS = 0.02
17
+ const FILLER_PHRASES = new Set([
18
+ 'you',
19
+ 'thank you',
20
+ 'thank you very much',
21
+ 'thank you for watching',
22
+ 'thanks for watching',
23
+ 'please subscribe',
24
+ ])
25
+
26
+ // Root-mean-square amplitude of the clip — a cheap loudness proxy. Normalized
27
+ // speech sits around 0.05–0.15; a silent room is well under 0.005.
28
+ function computeRms(pcm: Float32Array): number {
29
+ if (pcm.length === 0) return 0
30
+ let sum = 0
31
+ for (let i = 0; i < pcm.length; i++) sum += pcm[i] * pcm[i]
32
+ return Math.sqrt(sum / pcm.length)
33
+ }
34
+
35
+ function isFillerOnly(text: string): boolean {
36
+ const norm = text.toLowerCase().replace(/[.!?,…"']/g, '').replace(/\s+/g, ' ').trim()
37
+ return FILLER_PHRASES.has(norm)
38
+ }
39
+
9
40
  export interface ModelProgress {
10
41
  status: string
11
42
  file?: string
@@ -77,12 +108,25 @@ async function blobToPcm16k(blob: Blob): Promise<Float32Array | null> {
77
108
  }
78
109
 
79
110
  export async function transcribeBlob(blob: Blob, onProgress?: (p: ModelProgress) => void): Promise<string> {
80
- const transcriber = await loadTranscriber(onProgress)
111
+ // Decode first so a silent clip never triggers the (one-time, ~75MB) model
112
+ // download just to be thrown away below.
81
113
  const pcm = await blobToPcm16k(blob)
82
114
  if (!pcm) return ''
115
+ const rms = computeRms(pcm)
116
+ if (rms < SILENCE_RMS) {
117
+ console.info(`[voice] near-silent clip (rms=${rms.toFixed(4)}); skipping transcription`)
118
+ return ''
119
+ }
120
+
121
+ const transcriber = await loadTranscriber(onProgress)
83
122
  const result = await transcriber(pcm)
84
- const text = Array.isArray(result)
123
+ const text = (Array.isArray(result)
85
124
  ? result.map((r) => r.text).join(' ')
86
- : result?.text
87
- return (text || '').trim()
125
+ : result?.text || '').trim()
126
+
127
+ if (text && rms < LOW_CONFIDENCE_RMS && isFillerOnly(text)) {
128
+ console.info(`[voice] dropping filler-only output "${text}" from quiet clip (rms=${rms.toFixed(4)})`)
129
+ return ''
130
+ }
131
+ return text
88
132
  }