@phenx-inc/ctlsurf 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/headless/index.mjs +26 -10
- package/out/headless/index.mjs.map +2 -2
- package/out/main/index.js +31 -9
- package/out/preload/index.js +8 -0
- package/out/renderer/assets/{cssMode-eTXVdAkZ.js → cssMode-BQN8v2ok.js} +3 -3
- package/out/renderer/assets/{freemarker2-B5BKaiK4.js → freemarker2-DbxGYYVp.js} +1 -1
- package/out/renderer/assets/{handlebars-BIdLd2wU.js → handlebars-3auU1CAd.js} +1 -1
- package/out/renderer/assets/{html-BXL4cnLS.js → html-D8xFiRmI.js} +1 -1
- package/out/renderer/assets/{htmlMode-46N3XG2c.js → htmlMode-M3MApZ4n.js} +3 -3
- package/out/renderer/assets/{index-dRvutfbl.js → index---H6cxNl.js} +696 -33
- package/out/renderer/assets/{index-Cf-RsxoC.css → index-B-iM7dFC.css} +195 -0
- package/out/renderer/assets/{javascript-n_iZZzDX.js → javascript-BO_ViZM5.js} +2 -2
- package/out/renderer/assets/{jsonMode-DXDczSNu.js → jsonMode-CKp2zvZu.js} +3 -3
- package/out/renderer/assets/{liquid-B1QweUh7.js → liquid-C1eHcrht.js} +1 -1
- package/out/renderer/assets/{lspLanguageFeatures-DqzMqkRk.js → lspLanguageFeatures-CHWJx_Tl.js} +1 -1
- package/out/renderer/assets/{mdx-BCv8lm5e.js → mdx-Qqdtk7fL.js} +1 -1
- package/out/renderer/assets/{python-BLNzYwDv.js → python-DKu7rNbs.js} +1 -1
- package/out/renderer/assets/{razor-CvAww8bG.js → razor-BOMpCo6z.js} +1 -1
- package/out/renderer/assets/{tsMode-C7m6Kr5E.js → tsMode-yAjlPR-D.js} +1 -1
- package/out/renderer/assets/{typescript-DhPw4VVg.js → typescript-BiJRCUcL.js} +1 -1
- package/out/renderer/assets/{xml-B0WLFJ2U.js → xml-D4PvYeQq.js} +1 -1
- package/out/renderer/assets/{yaml-BWyn9Wd7.js → yaml-BeHVkmnS.js} +1 -1
- package/out/renderer/index.html +2 -2
- package/package.json +1 -1
- package/src/main/index.ts +7 -0
- package/src/main/orchestrator.ts +38 -9
- package/src/preload/index.ts +11 -0
- package/src/renderer/App.tsx +5 -0
- package/src/renderer/components/SpeakControls.tsx +235 -0
- package/src/renderer/components/VoiceInput.tsx +159 -3
- package/src/renderer/lib/localWhisper.ts +48 -4
- package/src/renderer/lib/speech.ts +299 -0
- package/src/renderer/styles.css +195 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
import { useCallback, useEffect, useRef, useState } from 'react'
|
|
2
|
+
import {
|
|
3
|
+
speech,
|
|
4
|
+
getEngine,
|
|
5
|
+
setEngine as persistEngine,
|
|
6
|
+
getVoiceURI,
|
|
7
|
+
setVoiceURI as persistVoiceURI,
|
|
8
|
+
getRate,
|
|
9
|
+
setRate as persistRate,
|
|
10
|
+
listWebVoices,
|
|
11
|
+
type TtsEngineId,
|
|
12
|
+
type TtsModelProgress,
|
|
13
|
+
} from '../lib/speech'
|
|
14
|
+
|
|
15
|
+
// Titlebar control for "speak agent replies" (Electron desktop only). A speaker
|
|
16
|
+
// toggle drives the main-process speakReplies setting (which runs the transcript
|
|
17
|
+
// tailer); a ▾ dropdown picks the engine, voice, and rate.
|
|
18
|
+
|
|
19
|
+
const SAMPLE = 'This is how spoken agent replies will sound.'
|
|
20
|
+
|
|
21
|
+
/**
 * Titlebar control for spoken agent replies.
 *
 * Renders a speaker toggle (bound to the main-process `speakReplies` setting
 * via `window.worker`), a transient stop button while audio is playing, and a
 * ▾ menu for choosing the TTS engine, the Web Speech voice, and the rate.
 * Engine, voice, and rate are persisted through the helpers in `../lib/speech`.
 */
export function SpeakControls() {
  const [enabled, setEnabled] = useState(false)
  const [engine, setEngineState] = useState<TtsEngineId>(getEngine)
  const [voices, setVoices] = useState<SpeechSynthesisVoice[]>([])
  const [voiceURI, setVoiceURIState] = useState<string | null>(getVoiceURI)
  const [rate, setRateState] = useState<number>(getRate)
  const [showMenu, setShowMenu] = useState(false)
  const [modelPct, setModelPct] = useState<number | null>(null)
  const [speaking, setSpeaking] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const wrapRef = useRef<HTMLDivElement>(null)
  // Set once the user has clicked the toggle, so a slow initial read of the
  // persisted setting cannot overwrite an explicit choice.
  const toggledRef = useRef(false)

  // Reflect the persisted main-process setting on mount.
  useEffect(() => {
    let alive = true
    window.worker.getSpeakReplies()
      .then((r) => { if (alive && !toggledRef.current) setEnabled(!!r.enabled) })
      .catch(() => { /* best effort: keep the default "off" state */ })
    return () => { alive = false }
  }, [])

  // Speak replies as they arrive. Main only forwards when the setting is on, so
  // this is a no-op while disabled.
  useEffect(() => {
    const off = window.worker.onAgentMessage((text) => speech.enqueue(text))
    return off
  }, [])

  // Surface neural-model download/transcribe progress on the button, and mirror
  // the speech engine's activity and error callbacks into component state.
  useEffect(() => {
    speech.onModelProgress = (p: TtsModelProgress | null) => {
      if (p && p.status === 'progress' && typeof p.progress === 'number') {
        setModelPct(Math.min(100, Math.round(p.progress)))
      } else if (!p) {
        setModelPct(null)
      }
    }
    speech.onActivityChange = (a) => setSpeaking(a)
    speech.onError = (msg) => setError(msg)
    return () => {
      speech.onModelProgress = null
      speech.onActivityChange = null
      speech.onError = null
    }
  }, [])

  // Auto-dismiss the error chip.
  useEffect(() => {
    if (!error) return undefined
    const t = setTimeout(() => setError(null), 4000)
    return () => clearTimeout(t)
  }, [error])

  // Web Speech voices populate asynchronously.
  useEffect(() => {
    const load = () => setVoices(listWebVoices())
    load()
    if (typeof speechSynthesis === 'undefined') return undefined
    speechSynthesis.addEventListener('voiceschanged', load)
    return () => speechSynthesis.removeEventListener('voiceschanged', load)
  }, [])

  // Close the menu on an outside click.
  useEffect(() => {
    if (!showMenu) return undefined
    const onDown = (e: MouseEvent) => {
      if (!wrapRef.current?.contains(e.target as Node)) setShowMenu(false)
    }
    document.addEventListener('mousedown', onDown)
    return () => document.removeEventListener('mousedown', onDown)
  }, [showMenu])

  const toggle = useCallback(async () => {
    const next = !enabled
    toggledRef.current = true
    setEnabled(next)
    // Prime the audio context from this click so neural playback isn't blocked
    // by autoplay policy when a reply later arrives without a gesture.
    if (next) speech.unlock()
    else speech.stop()
    try {
      await window.worker.setSpeakReplies(next)
    } catch (err) {
      // The main process did not accept the change: undo the optimistic flip
      // (unless a newer click already changed it) so the button does not show
      // a state the main process is not in, and tell the user.
      setEnabled((current) => (current === next ? !next : current))
      setError('Could not update the spoken replies setting')
      console.error('[speak] setSpeakReplies failed', err)
    }
  }, [enabled])

  const chooseEngine = useCallback((id: TtsEngineId) => {
    persistEngine(id)
    setEngineState(id)
    speech.stop()
    // Preload the neural model so the first reply isn't slowed by download.
    if (id === 'neural') speech.warmup()
  }, [])

  // An empty <select> value means "System default", stored as null.
  const chooseVoice = useCallback((uri: string) => {
    const v = uri || null
    persistVoiceURI(v)
    setVoiceURIState(v)
  }, [])

  const changeRate = useCallback((r: number) => {
    persistRate(r)
    setRateState(r)
  }, [])

  // Plays the sample sentence with the current settings, replacing any
  // speech already in progress.
  const testVoice = useCallback(() => {
    speech.unlock()
    speech.stop()
    speech.enqueue(SAMPLE)
  }, [])

  const title = enabled ? 'Spoken replies on — click to mute' : 'Speak agent replies (off)'

  return (
    <div className="speak-controls" ref={wrapRef}>
      <button
        type="button"
        className={`titlebar-btn titlebar-icon-btn speak-btn ${enabled ? 'active' : ''}`}
        onClick={toggle}
        title={title}
        aria-label="Toggle spoken agent replies"
        aria-pressed={enabled}
      >
        <svg viewBox="0 0 24 24" width="13" height="13" fill="none" stroke="currentColor"
          strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" aria-hidden="true">
          <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
          {enabled ? (
            <>
              <path d="M15.54 8.46a5 5 0 0 1 0 7.07" />
              <path d="M19.07 4.93a10 10 0 0 1 0 14.14" />
            </>
          ) : (
            <line x1="23" y1="9" x2="17" y2="15" />
          )}
        </svg>
        {modelPct !== null && <span className="speak-pct">{modelPct}%</span>}
      </button>
      {speaking && (
        <button
          type="button"
          className="titlebar-btn titlebar-icon-btn speak-stop"
          onClick={() => speech.stop()}
          title="Stop speaking"
          aria-label="Stop speaking"
        >
          <svg viewBox="0 0 24 24" width="11" height="11" fill="currentColor" aria-hidden="true">
            <rect x="5" y="5" width="14" height="14" rx="2" />
          </svg>
        </button>
      )}
      <button
        type="button"
        className="titlebar-btn speak-caret"
        onClick={() => setShowMenu((v) => !v)}
        title="Voice options"
        aria-label="Voice options"
        aria-expanded={showMenu}
      >
        ▾
      </button>

      {showMenu && (
        <div className="speak-menu" role="menu">
          <div className="speak-menu-head">Engine</div>
          <button
            type="button"
            className={`speak-menu-item ${engine === 'web' ? 'active' : ''}`}
            onClick={() => chooseEngine('web')}
          >
            <span className="speak-menu-check">{engine === 'web' ? '✓' : ''}</span>
            <span>System voice (instant)</span>
          </button>
          <button
            type="button"
            className={`speak-menu-item ${engine === 'neural' ? 'active' : ''}`}
            onClick={() => chooseEngine('neural')}
          >
            <span className="speak-menu-check">{engine === 'neural' ? '✓' : ''}</span>
            <span>Neural voice (downloads)</span>
          </button>

          {engine === 'web' && (
            <>
              <div className="speak-menu-head">Voice</div>
              <select
                className="speak-select"
                value={voiceURI || ''}
                onChange={(e) => chooseVoice(e.target.value)}
              >
                <option value="">System default</option>
                {voices.map((v) => (
                  <option key={v.voiceURI} value={v.voiceURI}>
                    {v.name} {v.lang ? `(${v.lang})` : ''}
                  </option>
                ))}
              </select>
            </>
          )}

          <div className="speak-menu-head">Rate · {rate.toFixed(1)}×</div>
          <input
            className="speak-rate"
            type="range"
            min={0.5}
            max={2}
            step={0.1}
            value={rate}
            onChange={(e) => changeRate(Number(e.target.value))}
          />

          <div className="speak-menu-row">
            <button type="button" className="speak-menu-btn" onClick={testVoice}>Test</button>
            <button type="button" className="speak-menu-btn" onClick={() => speech.stop()}>Stop</button>
          </div>
        </div>
      )}
      {error && <div className="speak-error">{error}</div>}
    </div>
  )
}
|
|
@@ -43,6 +43,12 @@ type Engine = 'web-speech' | 'local'
|
|
|
43
43
|
type Phase = 'idle' | 'listening' | 'transcribing'
|
|
44
44
|
|
|
45
45
|
const ENGINE_KEY = 'ctlsurf.voiceEngine'
|
|
46
|
+
// Persisted audioinput deviceId for the local engine; null/absent = OS default.
|
|
47
|
+
const DEVICE_KEY = 'ctlsurf.voiceDeviceId'
|
|
48
|
+
|
|
49
|
+
/**
 * Reads the saved microphone deviceId from localStorage.
 * Returns null when nothing is stored, the stored value is empty, or
 * localStorage access throws.
 */
function loadDeviceId(): string | null {
  let stored: string | null = null
  try {
    stored = localStorage.getItem(DEVICE_KEY)
  } catch {
    return null
  }
  return stored ? stored : null
}
|
|
46
52
|
|
|
47
53
|
const WEB_SPEECH_SUPPORTED = getRecognitionCtor() !== null
|
|
48
54
|
const LOCAL_SUPPORTED =
|
|
@@ -89,6 +95,14 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
|
|
|
89
95
|
const [error, setError] = useState<string | null>(null)
|
|
90
96
|
const [notice, setNotice] = useState<string | null>(null)
|
|
91
97
|
|
|
98
|
+
// Mic source selection (local engine only)
|
|
99
|
+
const [devices, setDevices] = useState<MediaDeviceInfo[]>([])
|
|
100
|
+
const [selectedDeviceId, setSelectedDeviceId] = useState<string | null>(loadDeviceId)
|
|
101
|
+
const [showDevicePicker, setShowDevicePicker] = useState(false)
|
|
102
|
+
const selectedDeviceIdRef = useRef(selectedDeviceId)
|
|
103
|
+
useEffect(() => { selectedDeviceIdRef.current = selectedDeviceId }, [selectedDeviceId])
|
|
104
|
+
const wrapRef = useRef<HTMLDivElement>(null)
|
|
105
|
+
|
|
92
106
|
// Web Speech refs
|
|
93
107
|
const recognitionRef = useRef<SpeechRecognitionLike | null>(null)
|
|
94
108
|
const finalRef = useRef('')
|
|
@@ -128,6 +142,88 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
|
|
|
128
142
|
streamRef.current = null
|
|
129
143
|
}, [])
|
|
130
144
|
|
|
145
|
+
// ─── Mic source selection ──────────────────────────
|
|
146
|
+
|
|
147
|
+
// Re-reads the device list and stores only the audio inputs in `devices`.
// Does nothing when enumerateDevices is unavailable; enumeration errors are
// ignored (best effort).
const refreshDevices = useCallback(async () => {
  if (!navigator.mediaDevices?.enumerateDevices) return
  let found: MediaDeviceInfo[]
  try {
    found = await navigator.mediaDevices.enumerateDevices()
  } catch {
    return
  }
  setDevices(found.filter((info) => info.kind === 'audioinput'))
}, [])
|
|
154
|
+
|
|
155
|
+
// Device labels stay empty until microphone permission has been granted. When
// every audio input is unlabeled, open a throwaway stream to trigger the
// permission prompt, stop it immediately, and then refresh the list so names
// can appear. A denied prompt is tolerated: the list is refreshed regardless.
const ensureDeviceLabels = useCallback(async () => {
  if (!navigator.mediaDevices?.enumerateDevices) return
  try {
    const everything = await navigator.mediaDevices.enumerateDevices()
    const mics = everything.filter((info) => info.kind === 'audioinput')
    const allUnlabeled = mics.length > 0 && !mics.some((info) => info.label)
    if (allUnlabeled) {
      const probe = await navigator.mediaDevices.getUserMedia({ audio: true })
      for (const track of probe.getTracks()) track.stop()
    }
  } catch { /* permission denied → we'll show generic names */ }
  await refreshDevices()
}, [refreshDevices])
|
|
169
|
+
|
|
170
|
+
// Flips the microphone picker between open and closed.
const toggleDevicePicker = useCallback(() => {
  setShowDevicePicker((isOpen) => (isOpen ? false : true))
}, [])
|
|
173
|
+
|
|
174
|
+
// Each time the picker opens, make sure device names are available (this may
// prompt for microphone permission).
useEffect(() => {
  if (!showDevicePicker) return
  void ensureDeviceLabels()
}, [showDevicePicker, ensureDeviceLabels])
|
|
178
|
+
|
|
179
|
+
// Selects a microphone (null = OS default), persists the choice to
// localStorage on a best-effort basis, and closes the picker.
const chooseDevice = useCallback((id: string | null) => {
  setSelectedDeviceId(id)
  try {
    if (!id) {
      localStorage.removeItem(DEVICE_KEY)
    } else {
      localStorage.setItem(DEVICE_KEY, id)
    }
  } catch { /* ignore */ }
  setShowDevicePicker(false)
}, [])
|
|
187
|
+
|
|
188
|
+
// Opens a microphone stream for the saved device, or the OS default when none
// is saved. If the saved device can no longer satisfy the exact-deviceId
// constraint (OverconstrainedError), the saved choice is forgotten, a notice
// is shown, and the OS default is opened instead. Any other failure is
// rethrown to the caller.
const getStream = useCallback(async (): Promise<MediaStream> => {
  const savedId = selectedDeviceIdRef.current
  const audio = savedId ? { deviceId: { exact: savedId } } : true
  try {
    return await navigator.mediaDevices.getUserMedia({ audio })
  } catch (err) {
    const savedDeviceGone =
      Boolean(savedId) && (err as { name?: string })?.name === 'OverconstrainedError'
    if (!savedDeviceGone) throw err

    try { localStorage.removeItem(DEVICE_KEY) } catch { /* ignore */ }
    setSelectedDeviceId(null)
    setNotice('Saved microphone unavailable — using system default.')
    return navigator.mediaDevices.getUserMedia({ audio: true })
  }
}, [])
|
|
206
|
+
|
|
207
|
+
// Load the device list once and reload it whenever the browser reports a
// 'devicechange' (plug/unplug). Skipped entirely when mediaDevices cannot
// register event listeners.
useEffect(() => {
  const media = navigator.mediaDevices
  if (!media?.addEventListener) return
  const handleDeviceChange = () => {
    void refreshDevices()
  }
  media.addEventListener('devicechange', handleDeviceChange)
  void refreshDevices()
  return () => {
    media.removeEventListener('devicechange', handleDeviceChange)
  }
}, [refreshDevices])
|
|
216
|
+
|
|
217
|
+
// While the picker is open, a mousedown anywhere outside the wrapper element
// closes it. The listener is attached only for as long as the picker is open.
useEffect(() => {
  if (!showDevicePicker) return
  const handleMouseDown = (event: MouseEvent) => {
    const inside = wrapRef.current?.contains(event.target as Node)
    if (!inside) setShowDevicePicker(false)
  }
  document.addEventListener('mousedown', handleMouseDown)
  return () => {
    document.removeEventListener('mousedown', handleMouseDown)
  }
}, [showDevicePicker])
|
|
226
|
+
|
|
131
227
|
// ─── Web Speech engine ─────────────────────────────
|
|
132
228
|
|
|
133
229
|
const startWebSpeech = useCallback(() => {
|
|
@@ -207,6 +303,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
|
|
|
207
303
|
try {
|
|
208
304
|
const text = await transcribeBlob(blob, handleModelProgress)
|
|
209
305
|
if (text) onTranscriptRef.current(text)
|
|
306
|
+
else setNotice('No speech detected — check the mic source (▾).')
|
|
210
307
|
} catch (err) {
|
|
211
308
|
setError('On-device transcription failed')
|
|
212
309
|
console.error('[voice] local transcription failed', err)
|
|
@@ -220,7 +317,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
|
|
|
220
317
|
setError(null); setNotice(null); setInterim('')
|
|
221
318
|
cancelGestureRef.current = false
|
|
222
319
|
try {
|
|
223
|
-
const stream = await
|
|
320
|
+
const stream = await getStream()
|
|
224
321
|
// Released during the permission/await — don't record anything.
|
|
225
322
|
if (cancelGestureRef.current) {
|
|
226
323
|
stream.getTracks().forEach((t) => t.stop())
|
|
@@ -241,7 +338,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
|
|
|
241
338
|
setError(describeMicError(err))
|
|
242
339
|
console.error('[voice] getUserMedia failed', err)
|
|
243
340
|
}
|
|
244
|
-
}, [runLocalTranscription, stopStream])
|
|
341
|
+
}, [runLocalTranscription, stopStream, getStream])
|
|
245
342
|
|
|
246
343
|
const stopLocal = useCallback(() => {
|
|
247
344
|
cancelGestureRef.current = true
|
|
@@ -299,8 +396,23 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
|
|
|
299
396
|
? `voice-btn voice-btn-floating ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
|
|
300
397
|
: `titlebar-btn titlebar-icon-btn voice-btn ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
|
|
301
398
|
|
|
399
|
+
// The "default" and "communications" entries are Chromium's synthetic
// aggregates; they are left out of the list because the menu has its own
// "System default" item.
const realDevices = devices.filter((mic) => {
  if (!mic.deviceId) return false
  return mic.deviceId !== 'default' && mic.deviceId !== 'communications'
})
// Label of the synthetic "default" entry, if present, with its "Default - "
// prefix removed for display.
const rawDefault = devices.find((mic) => mic.deviceId === 'default')?.label
let defaultLabel = 'System default'
if (rawDefault) {
  defaultLabel = `System default · ${rawDefault.replace(/^Default\s*-\s*/i, '')}`
}
// Name of the microphone currently in effect.
let activeLabel = defaultLabel
if (selectedDeviceId) {
  const chosen = realDevices.find((mic) => mic.deviceId === selectedDeviceId)
  activeLabel = chosen?.label || 'Selected microphone'
}
// Source selection only affects the local (getUserMedia) engine.
const showSourcePicker = floating && LOCAL_SUPPORTED
|
|
413
|
+
|
|
302
414
|
return (
|
|
303
|
-
<div className="voice-input-wrap">
|
|
415
|
+
<div className="voice-input-wrap" ref={wrapRef}>
|
|
304
416
|
<button
|
|
305
417
|
type="button"
|
|
306
418
|
className={btnClass}
|
|
@@ -315,6 +427,50 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
|
|
|
315
427
|
<span className="voice-icon" aria-hidden="true">🎤</span>
|
|
316
428
|
<span className={`voice-dot ${listening ? 'on' : busy ? 'busy' : 'off'}`} />
|
|
317
429
|
</button>
|
|
430
|
+
{showSourcePicker && (
|
|
431
|
+
<button
|
|
432
|
+
type="button"
|
|
433
|
+
className="voice-source-btn"
|
|
434
|
+
onPointerDown={(e) => e.stopPropagation()}
|
|
435
|
+
onClick={toggleDevicePicker}
|
|
436
|
+
title={`Mic source: ${activeLabel}`}
|
|
437
|
+
aria-label="Choose microphone source"
|
|
438
|
+
aria-expanded={showDevicePicker}
|
|
439
|
+
>
|
|
440
|
+
▾
|
|
441
|
+
</button>
|
|
442
|
+
)}
|
|
443
|
+
{showSourcePicker && showDevicePicker && (
|
|
444
|
+
<div className="voice-source-menu" role="menu">
|
|
445
|
+
<div className="voice-source-head">Microphone source</div>
|
|
446
|
+
<button
|
|
447
|
+
type="button"
|
|
448
|
+
role="menuitemradio"
|
|
449
|
+
aria-checked={selectedDeviceId === null}
|
|
450
|
+
className={`voice-source-item ${selectedDeviceId === null ? 'active' : ''}`}
|
|
451
|
+
onClick={() => chooseDevice(null)}
|
|
452
|
+
>
|
|
453
|
+
<span className="voice-source-check">{selectedDeviceId === null ? '✓' : ''}</span>
|
|
454
|
+
<span className="voice-source-label">{defaultLabel}</span>
|
|
455
|
+
</button>
|
|
456
|
+
{realDevices.map((d, i) => (
|
|
457
|
+
<button
|
|
458
|
+
key={d.deviceId}
|
|
459
|
+
type="button"
|
|
460
|
+
role="menuitemradio"
|
|
461
|
+
aria-checked={selectedDeviceId === d.deviceId}
|
|
462
|
+
className={`voice-source-item ${selectedDeviceId === d.deviceId ? 'active' : ''}`}
|
|
463
|
+
onClick={() => chooseDevice(d.deviceId)}
|
|
464
|
+
>
|
|
465
|
+
<span className="voice-source-check">{selectedDeviceId === d.deviceId ? '✓' : ''}</span>
|
|
466
|
+
<span className="voice-source-label">{d.label || `Microphone ${i + 1}`}</span>
|
|
467
|
+
</button>
|
|
468
|
+
))}
|
|
469
|
+
{realDevices.length === 0 && (
|
|
470
|
+
<div className="voice-source-empty">No microphones found</div>
|
|
471
|
+
)}
|
|
472
|
+
</div>
|
|
473
|
+
)}
|
|
318
474
|
{chip && <div className={`voice-chip ${chip.kind} ${floating ? 'voice-chip-floating' : ''}`}>{chip.text}</div>}
|
|
319
475
|
</div>
|
|
320
476
|
)
|
|
@@ -6,6 +6,37 @@
|
|
|
6
6
|
const MODEL = 'Xenova/whisper-base'
|
|
7
7
|
const TARGET_SAMPLE_RATE = 16000
|
|
8
8
|
|
|
9
|
+
// Fed silence, Whisper tends to invent filler such as "you", "Thank you." or
// "Thanks for watching.". Two loudness thresholds guard against that:
//   - clips quieter than SILENCE_RMS are treated as silent and never reach
//     the model;
//   - clips quieter than LOW_CONFIDENCE_RMS are still transcribed, but the
//     result is discarded when it consists solely of a known filler phrase
//     (more likely a hallucination on a quiet clip than real speech).
const SILENCE_RMS = 0.008
const LOW_CONFIDENCE_RMS = 0.02
// Normalized (lower-case, punctuation-free) phrases regarded as filler.
const FILLER_PHRASES = new Set<string>([
  'you',
  'thank you',
  'thank you very much',
  'thank you for watching',
  'thanks for watching',
  'please subscribe',
])
|
|
25
|
+
|
|
26
|
+
/**
 * Root-mean-square amplitude of a PCM clip, used as a cheap loudness measure.
 *
 * @param pcm Audio samples.
 * @returns sqrt(mean(sample²)), or 0 for an empty clip.
 */
function computeRms(pcm: Float32Array): number {
  const sampleCount = pcm.length
  if (sampleCount === 0) return 0
  let energy = 0
  for (const sample of pcm) {
    energy += sample * sample
  }
  return Math.sqrt(energy / sampleCount)
}
|
|
34
|
+
|
|
35
|
+
/**
 * Reports whether `text`, once lower-cased, stripped of the punctuation
 * characters . ! ? , … " ' and whitespace-collapsed, is exactly one of the
 * entries in FILLER_PHRASES.
 */
function isFillerOnly(text: string): boolean {
  const withoutPunctuation = text.toLowerCase().replace(/[.!?,…"']/g, '')
  const normalized = withoutPunctuation.replace(/\s+/g, ' ').trim()
  return FILLER_PHRASES.has(normalized)
}
|
|
39
|
+
|
|
9
40
|
export interface ModelProgress {
|
|
10
41
|
status: string
|
|
11
42
|
file?: string
|
|
@@ -77,12 +108,25 @@ async function blobToPcm16k(blob: Blob): Promise<Float32Array | null> {
|
|
|
77
108
|
}
|
|
78
109
|
|
|
79
110
|
/**
 * Transcribes a recorded audio blob on-device.
 *
 * The clip is decoded and its loudness measured before the transcriber is
 * loaded, so a silent clip never triggers the (one-time, ~75MB) model
 * download only to be thrown away.
 *
 * @param blob Recorded audio.
 * @param onProgress Optional callback passed to `loadTranscriber`.
 * @returns The trimmed transcript, or '' when the clip cannot be decoded, is
 *   near-silent, or is a quiet clip whose transcript is only a filler phrase.
 */
export async function transcribeBlob(blob: Blob, onProgress?: (p: ModelProgress) => void): Promise<string> {
  const pcm = await blobToPcm16k(blob)
  if (!pcm) return ''

  const rms = computeRms(pcm)
  if (rms < SILENCE_RMS) {
    console.info(`[voice] near-silent clip (rms=${rms.toFixed(4)}); skipping transcription`)
    return ''
  }

  const transcriber = await loadTranscriber(onProgress)
  const result = await transcriber(pcm)
  // The transcriber may return one result or an array of results.
  const joined = Array.isArray(result)
    ? result.map((r) => r.text).join(' ')
    : result?.text || ''
  const text = joined.trim()

  const fillerOnQuietClip = text && rms < LOW_CONFIDENCE_RMS && isFillerOnly(text)
  if (fillerOnQuietClip) {
    console.info(`[voice] dropping filler-only output "${text}" from quiet clip (rms=${rms.toFixed(4)})`)
    return ''
  }
  return text
}
|