@phenx-inc/ctlsurf 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/headless/index.mjs +26 -10
- package/out/headless/index.mjs.map +2 -2
- package/out/main/index.js +31 -9
- package/out/preload/index.js +8 -0
- package/out/renderer/assets/{cssMode-DbMmcl1h.js → cssMode-BQN8v2ok.js} +3 -3
- package/out/renderer/assets/{freemarker2-CvaHiy92.js → freemarker2-DbxGYYVp.js} +1 -1
- package/out/renderer/assets/{handlebars-D58lUIOu.js → handlebars-3auU1CAd.js} +1 -1
- package/out/renderer/assets/{html-D1h1aJbM.js → html-D8xFiRmI.js} +1 -1
- package/out/renderer/assets/{htmlMode-BdkAp9qr.js → htmlMode-M3MApZ4n.js} +3 -3
- package/out/renderer/assets/{index-B60JU1yI.js → index---H6cxNl.js} +854 -38
- package/out/renderer/assets/{index-DJFYmHjz.css → index-B-iM7dFC.css} +269 -0
- package/out/renderer/assets/{javascript-CXqZcnvb.js → javascript-BO_ViZM5.js} +2 -2
- package/out/renderer/assets/{jsonMode-BuVr-eSl.js → jsonMode-CKp2zvZu.js} +3 -3
- package/out/renderer/assets/{liquid-LKu0Wd0B.js → liquid-C1eHcrht.js} +1 -1
- package/out/renderer/assets/{lspLanguageFeatures-Cjr_4HGs.js → lspLanguageFeatures-CHWJx_Tl.js} +1 -1
- package/out/renderer/assets/{mdx-Bl84ILla.js → mdx-Qqdtk7fL.js} +1 -1
- package/out/renderer/assets/{python-0sFd9G1k.js → python-DKu7rNbs.js} +1 -1
- package/out/renderer/assets/{razor-Cqcu1rLJ.js → razor-BOMpCo6z.js} +1 -1
- package/out/renderer/assets/{tsMode-CYd3NUkW.js → tsMode-yAjlPR-D.js} +1 -1
- package/out/renderer/assets/{typescript-rkc9lhpi.js → typescript-BiJRCUcL.js} +1 -1
- package/out/renderer/assets/{xml-EsHEUps1.js → xml-D4PvYeQq.js} +1 -1
- package/out/renderer/assets/{yaml-B9-nQ_s2.js → yaml-BeHVkmnS.js} +1 -1
- package/out/renderer/index.html +2 -2
- package/package.json +1 -1
- package/src/main/index.ts +7 -0
- package/src/main/orchestrator.ts +38 -9
- package/src/preload/index.ts +11 -0
- package/src/renderer/App.tsx +39 -6
- package/src/renderer/components/FloatingMic.tsx +128 -0
- package/src/renderer/components/SpeakControls.tsx +235 -0
- package/src/renderer/components/VoiceInput.tsx +170 -6
- package/src/renderer/lib/localWhisper.ts +48 -4
- package/src/renderer/lib/speech.ts +299 -0
- package/src/renderer/styles.css +269 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import { useCallback, useEffect, useRef, useState } from 'react'
|
|
2
|
+
import { VoiceInput } from './VoiceInput'
|
|
3
|
+
|
|
4
|
+
// A draggable, dismissable push-to-talk mic that floats over the panes. It wraps
|
|
5
|
+
// the same <VoiceInput> push-to-talk logic used in the titlebar; only the chrome
|
|
6
|
+
// (drag handle + hide button) and positioning live here.
|
|
7
|
+
|
|
8
|
+
const POS_KEY = 'ctlsurf.floatingMicPos'
|
|
9
|
+
|
|
10
|
+
interface Pos { x: number; y: number }
|
|
11
|
+
|
|
12
|
+
interface FloatingMicProps {
|
|
13
|
+
onTranscript: (text: string) => void
|
|
14
|
+
onHide: () => void
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
// Keep the button clear of the 38px titlebar and 24px status bar.
|
|
18
|
+
const EDGE = 20
|
|
19
|
+
const TOP_MIN = 46
|
|
20
|
+
const BOTTOM_GAP = 36
|
|
21
|
+
|
|
22
|
+
/**
 * Read the last persisted floating-mic position from localStorage.
 *
 * @returns The saved `{ x, y }` when the stored value is a JSON object whose
 *   `x` and `y` are both finite numbers; otherwise `null` (nothing stored,
 *   malformed JSON, wrong shape, or storage unavailable).
 */
function loadPos(): Pos | null {
  try {
    const raw = localStorage.getItem(POS_KEY)
    if (!raw) return null
    // Treat the payload as untrusted: it may be hand-edited or come from an
    // older build, so validate the shape before using it.
    const parsed: unknown = JSON.parse(raw)
    if (typeof parsed !== 'object' || parsed === null) return null
    const { x, y } = parsed as Partial<Record<keyof Pos, unknown>>
    // JSON.parse can produce Infinity from an overflowing literal (e.g. 1e999),
    // which would become an invalid CSS offset — require finite numbers.
    if (
      typeof x === 'number' && Number.isFinite(x) &&
      typeof y === 'number' && Number.isFinite(y)
    ) {
      return { x, y }
    }
  } catch { /* storage unavailable or malformed JSON — fall through to null */ }
  return null
}
|
|
32
|
+
|
|
33
|
+
/**
 * Draggable, dismissable push-to-talk mic that floats over the panes.
 *
 * Renders a drag handle (with a hide button) above a `<VoiceInput
 * variant="floating">`. The position is kept inside the viewport margins
 * (EDGE / TOP_MIN / BOTTOM_GAP) and is written to localStorage under POS_KEY
 * when a drag ends.
 *
 * @param onTranscript Forwarded to the inner VoiceInput.
 * @param onHide Invoked when the hide (×) button is clicked.
 */
export function FloatingMic({ onTranscript, onHide }: FloatingMicProps) {
  const [pos, setPos] = useState<Pos | null>(loadPos)
  const elRef = useRef<HTMLDivElement>(null)
  // Pointer-to-element offset captured at drag start; null when not dragging.
  const dragRef = useRef<{ dx: number; dy: number } | null>(null)
  // Latest position, mirrored outside React state so event handlers can read
  // and persist it without running side effects inside a state updater.
  const posRef = useRef<Pos | null>(pos)

  // Keep the button fully inside the viewport (used on drag, mount, and resize).
  const clamp = useCallback((x: number, y: number): Pos => {
    const el = elRef.current
    const w = el?.offsetWidth ?? 64
    const h = el?.offsetHeight ?? 90
    return {
      x: Math.max(EDGE, Math.min(x, window.innerWidth - w - EDGE)),
      y: Math.max(TOP_MIN, Math.min(y, window.innerHeight - h - BOTTOM_GAP)),
    }
  }, [])

  // Single place that moves the mic: updates the ref mirror and the state.
  const place = useCallback((next: Pos) => {
    posRef.current = next
    setPos(next)
  }, [])

  // On mount: pull a restored position back into view (the window may be
  // smaller than when it was saved), or default to bottom-right when nothing
  // was saved. Passing the viewport size to clamp() yields the bottom-right
  // corner inside the margins.
  useEffect(() => {
    const saved = posRef.current
    if (saved) {
      const fixed = clamp(saved.x, saved.y)
      if (fixed.x !== saved.x || fixed.y !== saved.y) place(fixed)
    } else {
      place(clamp(window.innerWidth, window.innerHeight))
    }
  }, [clamp, place])

  // Keep it reachable if the window shrinks.
  useEffect(() => {
    const onResize = () => {
      const current = posRef.current
      if (current) place(clamp(current.x, current.y))
    }
    window.addEventListener('resize', onResize)
    return () => window.removeEventListener('resize', onResize)
  }, [clamp, place])

  const onHandleDown = useCallback((e: React.PointerEvent) => {
    const el = elRef.current
    if (!el) return
    e.preventDefault()
    const rect = el.getBoundingClientRect()
    dragRef.current = { dx: e.clientX - rect.left, dy: e.clientY - rect.top }
    // Capture so move/up events keep arriving even when the pointer leaves the handle.
    e.currentTarget.setPointerCapture?.(e.pointerId)
  }, [])

  const onHandleMove = useCallback((e: React.PointerEvent) => {
    const d = dragRef.current
    if (!d) return
    place(clamp(e.clientX - d.dx, e.clientY - d.dy))
  }, [clamp, place])

  // Ends a drag (pointer up or cancel) and persists the final position.
  const onHandleUp = useCallback((e: React.PointerEvent) => {
    if (!dragRef.current) return
    dragRef.current = null
    // Releasing a capture that is no longer held throws, so check first.
    if (e.currentTarget.hasPointerCapture?.(e.pointerId)) {
      e.currentTarget.releasePointerCapture(e.pointerId)
    }
    const finalPos = posRef.current
    if (finalPos) {
      try { localStorage.setItem(POS_KEY, JSON.stringify(finalPos)) } catch { /* ignore */ }
    }
  }, [])

  // Render off-screen+hidden until the first position is computed (no flash).
  const style: React.CSSProperties = pos
    ? { left: pos.x, top: pos.y }
    : { left: -9999, top: -9999, visibility: 'hidden' }

  return (
    <div ref={elRef} className="floating-mic" style={style}>
      <div
        className="floating-mic-handle"
        onPointerDown={onHandleDown}
        onPointerMove={onHandleMove}
        onPointerUp={onHandleUp}
        onPointerCancel={onHandleUp}
        title="Drag to move"
        aria-label="Drag floating mic"
      >
        <span className="floating-mic-grip" aria-hidden="true">⠿</span>
        <button
          type="button"
          className="floating-mic-hide"
          // Don't let a click on the hide button start a drag.
          onPointerDown={(e) => e.stopPropagation()}
          onClick={onHide}
          title="Hide floating mic"
          aria-label="Hide floating mic"
        >
          ×
        </button>
      </div>
      <VoiceInput variant="floating" onTranscript={onTranscript} />
    </div>
  )
}
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
import { useCallback, useEffect, useRef, useState } from 'react'
|
|
2
|
+
import {
|
|
3
|
+
speech,
|
|
4
|
+
getEngine,
|
|
5
|
+
setEngine as persistEngine,
|
|
6
|
+
getVoiceURI,
|
|
7
|
+
setVoiceURI as persistVoiceURI,
|
|
8
|
+
getRate,
|
|
9
|
+
setRate as persistRate,
|
|
10
|
+
listWebVoices,
|
|
11
|
+
type TtsEngineId,
|
|
12
|
+
type TtsModelProgress,
|
|
13
|
+
} from '../lib/speech'
|
|
14
|
+
|
|
15
|
+
// Titlebar control for "speak agent replies" (Electron desktop only). A speaker
|
|
16
|
+
// toggle drives the main-process speakReplies setting (which runs the transcript
|
|
17
|
+
// tailer); a ▾ dropdown picks the engine, voice, and rate.
|
|
18
|
+
|
|
19
|
+
const SAMPLE = 'This is how spoken agent replies will sound.'
|
|
20
|
+
|
|
21
|
+
/**
 * Titlebar control for spoken agent replies.
 *
 * - The speaker button toggles the main-process `speakReplies` setting through
 *   `window.worker`; if that call fails the button is rolled back and an error
 *   chip is shown, so the UI never claims a state that was not applied.
 * - The ▾ button opens a menu to pick the TTS engine, the Web Speech voice
 *   (web engine only) and the speaking rate; each choice is persisted through
 *   the `../lib/speech` setters.
 * - Incoming agent messages are queued on the shared `speech` object, whose
 *   progress / activity / error callbacks drive the percentage badge, the stop
 *   button and the error chip.
 */
export function SpeakControls() {
  const [enabled, setEnabled] = useState(false)
  const [engine, setEngineState] = useState<TtsEngineId>(getEngine)
  const [voices, setVoices] = useState<SpeechSynthesisVoice[]>([])
  const [voiceURI, setVoiceURIState] = useState<string | null>(getVoiceURI)
  const [rate, setRateState] = useState<number>(getRate)
  const [showMenu, setShowMenu] = useState(false)
  const [modelPct, setModelPct] = useState<number | null>(null)
  const [speaking, setSpeaking] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const wrapRef = useRef<HTMLDivElement>(null)

  // Reflect the persisted main-process setting on mount.
  useEffect(() => {
    let alive = true
    window.worker.getSpeakReplies().then((r) => { if (alive) setEnabled(!!r.enabled) }).catch(() => {})
    return () => { alive = false }
  }, [])

  // Speak replies as they arrive. Main only forwards when the setting is on, so
  // this is a no-op while disabled.
  useEffect(() => {
    const off = window.worker.onAgentMessage((text) => speech.enqueue(text))
    return off
  }, [])

  // Surface neural-model progress, speaking activity and errors from the
  // shared speech object. The handlers are cleared on unmount.
  useEffect(() => {
    speech.onModelProgress = (p: TtsModelProgress | null) => {
      if (p && p.status === 'progress' && typeof p.progress === 'number') {
        setModelPct(Math.min(100, Math.round(p.progress)))
      } else if (!p) {
        // A null report clears the badge; other statuses leave it untouched.
        setModelPct(null)
      }
    }
    speech.onActivityChange = (a) => setSpeaking(a)
    speech.onError = (msg) => setError(msg)
    return () => {
      speech.onModelProgress = null
      speech.onActivityChange = null
      speech.onError = null
    }
  }, [])

  // Auto-dismiss the error chip.
  useEffect(() => {
    if (!error) return
    const t = setTimeout(() => setError(null), 4000)
    return () => clearTimeout(t)
  }, [error])

  // Web Speech voices populate asynchronously.
  useEffect(() => {
    const load = () => setVoices(listWebVoices())
    load()
    if (typeof speechSynthesis !== 'undefined') {
      speechSynthesis.addEventListener('voiceschanged', load)
      return () => speechSynthesis.removeEventListener('voiceschanged', load)
    }
  }, [])

  // Close the menu on an outside click.
  useEffect(() => {
    if (!showMenu) return
    const onDown = (e: MouseEvent) => {
      if (!wrapRef.current?.contains(e.target as Node)) setShowMenu(false)
    }
    document.addEventListener('mousedown', onDown)
    return () => document.removeEventListener('mousedown', onDown)
  }, [showMenu])

  const toggle = useCallback(async () => {
    const next = !enabled
    // Optimistic update so the button responds immediately.
    setEnabled(next)
    // Prime the audio context from this click so neural playback isn't blocked
    // by autoplay policy when a reply later arrives without a gesture.
    if (next) speech.unlock()
    else speech.stop()
    try {
      await window.worker.setSpeakReplies(next)
    } catch (err) {
      // The main process did not accept the change: roll the button back so it
      // keeps matching the real setting, and tell the user.
      setEnabled(!next)
      setError('Could not update the spoken-replies setting')
      console.error('[speech] setSpeakReplies failed', err)
    }
  }, [enabled])

  const chooseEngine = useCallback((id: TtsEngineId) => {
    persistEngine(id)
    setEngineState(id)
    speech.stop()
    // Preload the neural model so the first reply isn't slowed by download.
    if (id === 'neural') speech.warmup()
  }, [])

  // An empty string from the <select> means "System default" and is stored as null.
  const chooseVoice = useCallback((uri: string) => {
    const v = uri || null
    persistVoiceURI(v)
    setVoiceURIState(v)
  }, [])

  const changeRate = useCallback((r: number) => {
    persistRate(r)
    setRateState(r)
  }, [])

  // Speak the sample sentence with the current settings, replacing anything queued.
  const testVoice = useCallback(() => {
    speech.unlock()
    speech.stop()
    speech.enqueue(SAMPLE)
  }, [])

  const title = enabled ? 'Spoken replies on — click to mute' : 'Speak agent replies (off)'

  return (
    <div className="speak-controls" ref={wrapRef}>
      <button
        type="button"
        className={`titlebar-btn titlebar-icon-btn speak-btn ${enabled ? 'active' : ''}`}
        onClick={toggle}
        title={title}
        aria-label="Toggle spoken agent replies"
        aria-pressed={enabled}
      >
        <svg viewBox="0 0 24 24" width="13" height="13" fill="none" stroke="currentColor"
          strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" aria-hidden="true">
          <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
          {enabled ? (
            <>
              <path d="M15.54 8.46a5 5 0 0 1 0 7.07" />
              <path d="M19.07 4.93a10 10 0 0 1 0 14.14" />
            </>
          ) : (
            <line x1="23" y1="9" x2="17" y2="15" />
          )}
        </svg>
        {modelPct !== null && <span className="speak-pct">{modelPct}%</span>}
      </button>
      {speaking && (
        <button
          type="button"
          className="titlebar-btn titlebar-icon-btn speak-stop"
          onClick={() => speech.stop()}
          title="Stop speaking"
          aria-label="Stop speaking"
        >
          <svg viewBox="0 0 24 24" width="11" height="11" fill="currentColor" aria-hidden="true">
            <rect x="5" y="5" width="14" height="14" rx="2" />
          </svg>
        </button>
      )}
      <button
        type="button"
        className="titlebar-btn speak-caret"
        onClick={() => setShowMenu((v) => !v)}
        title="Voice options"
        aria-label="Voice options"
        aria-expanded={showMenu}
      >
        ▾
      </button>

      {showMenu && (
        <div className="speak-menu" role="menu">
          <div className="speak-menu-head">Engine</div>
          <button
            type="button"
            role="menuitemradio"
            aria-checked={engine === 'web'}
            className={`speak-menu-item ${engine === 'web' ? 'active' : ''}`}
            onClick={() => chooseEngine('web')}
          >
            <span className="speak-menu-check">{engine === 'web' ? '✓' : ''}</span>
            <span>System voice (instant)</span>
          </button>
          <button
            type="button"
            role="menuitemradio"
            aria-checked={engine === 'neural'}
            className={`speak-menu-item ${engine === 'neural' ? 'active' : ''}`}
            onClick={() => chooseEngine('neural')}
          >
            <span className="speak-menu-check">{engine === 'neural' ? '✓' : ''}</span>
            <span>Neural voice (downloads)</span>
          </button>

          {engine === 'web' && (
            <>
              <div className="speak-menu-head">Voice</div>
              <select
                className="speak-select"
                value={voiceURI || ''}
                onChange={(e) => chooseVoice(e.target.value)}
              >
                <option value="">System default</option>
                {voices.map((v) => (
                  <option key={v.voiceURI} value={v.voiceURI}>
                    {v.name} {v.lang ? `(${v.lang})` : ''}
                  </option>
                ))}
              </select>
            </>
          )}

          <div className="speak-menu-head">Rate · {rate.toFixed(1)}×</div>
          <input
            className="speak-rate"
            type="range"
            min={0.5}
            max={2}
            step={0.1}
            value={rate}
            onChange={(e) => changeRate(Number(e.target.value))}
          />

          <div className="speak-menu-row">
            <button type="button" className="speak-menu-btn" onClick={testVoice}>Test</button>
            <button type="button" className="speak-menu-btn" onClick={() => speech.stop()}>Stop</button>
          </div>
        </div>
      )}
      {error && <div className="speak-error">{error}</div>}
    </div>
  )
}
|
|
@@ -43,6 +43,12 @@ type Engine = 'web-speech' | 'local'
|
|
|
43
43
|
type Phase = 'idle' | 'listening' | 'transcribing'
|
|
44
44
|
|
|
45
45
|
const ENGINE_KEY = 'ctlsurf.voiceEngine'
|
|
46
|
+
// Persisted audioinput deviceId for the local engine; null/absent = OS default.
|
|
47
|
+
const DEVICE_KEY = 'ctlsurf.voiceDeviceId'
|
|
48
|
+
|
|
49
|
+
/**
 * Read the persisted microphone deviceId.
 *
 * @returns The stored id, or `null` when nothing (or an empty string) is
 *   stored or localStorage cannot be read.
 */
function loadDeviceId(): string | null {
  let stored: string | null
  try {
    stored = localStorage.getItem(DEVICE_KEY)
  } catch {
    return null
  }
  // An empty string is treated the same as "no saved device".
  return stored || null
}
|
|
46
52
|
|
|
47
53
|
const WEB_SPEECH_SUPPORTED = getRecognitionCtor() !== null
|
|
48
54
|
const LOCAL_SUPPORTED =
|
|
@@ -76,9 +82,12 @@ function describeMicError(err: unknown): string {
|
|
|
76
82
|
interface VoiceInputProps {
|
|
77
83
|
// Called once per push-to-talk session with the final transcribed text.
|
|
78
84
|
onTranscript: (text: string) => void
|
|
85
|
+
// 'titlebar' (default) renders the compact titlebar pill; 'floating' renders
|
|
86
|
+
// a round FAB used by the draggable on-canvas mic (see FloatingMic).
|
|
87
|
+
variant?: 'titlebar' | 'floating'
|
|
79
88
|
}
|
|
80
89
|
|
|
81
|
-
export function VoiceInput({ onTranscript }: VoiceInputProps) {
|
|
90
|
+
export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputProps) {
|
|
82
91
|
const [engine, setEngine] = useState<Engine>(loadInitialEngine)
|
|
83
92
|
const [phase, setPhase] = useState<Phase>('idle')
|
|
84
93
|
const [interim, setInterim] = useState('')
|
|
@@ -86,6 +95,14 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
|
|
|
86
95
|
const [error, setError] = useState<string | null>(null)
|
|
87
96
|
const [notice, setNotice] = useState<string | null>(null)
|
|
88
97
|
|
|
98
|
+
// Mic source selection (local engine only)
|
|
99
|
+
const [devices, setDevices] = useState<MediaDeviceInfo[]>([])
|
|
100
|
+
const [selectedDeviceId, setSelectedDeviceId] = useState<string | null>(loadDeviceId)
|
|
101
|
+
const [showDevicePicker, setShowDevicePicker] = useState(false)
|
|
102
|
+
const selectedDeviceIdRef = useRef(selectedDeviceId)
|
|
103
|
+
useEffect(() => { selectedDeviceIdRef.current = selectedDeviceId }, [selectedDeviceId])
|
|
104
|
+
const wrapRef = useRef<HTMLDivElement>(null)
|
|
105
|
+
|
|
89
106
|
// Web Speech refs
|
|
90
107
|
const recognitionRef = useRef<SpeechRecognitionLike | null>(null)
|
|
91
108
|
const finalRef = useRef('')
|
|
@@ -125,6 +142,88 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
|
|
|
125
142
|
streamRef.current = null
|
|
126
143
|
}, [])
|
|
127
144
|
|
|
145
|
+
// ─── Mic source selection ──────────────────────────
|
|
146
|
+
|
|
147
|
+
const refreshDevices = useCallback(async () => {
|
|
148
|
+
if (!navigator.mediaDevices?.enumerateDevices) return
|
|
149
|
+
try {
|
|
150
|
+
const all = await navigator.mediaDevices.enumerateDevices()
|
|
151
|
+
setDevices(all.filter((d) => d.kind === 'audioinput'))
|
|
152
|
+
} catch { /* ignore */ }
|
|
153
|
+
}, [])
|
|
154
|
+
|
|
155
|
+
// Device labels are blank until mic permission is granted, so when the user
|
|
156
|
+
// opens the picker we request a one-shot permission to populate names.
|
|
157
|
+
const ensureDeviceLabels = useCallback(async () => {
|
|
158
|
+
if (!navigator.mediaDevices?.enumerateDevices) return
|
|
159
|
+
try {
|
|
160
|
+
const all = await navigator.mediaDevices.enumerateDevices()
|
|
161
|
+
const inputs = all.filter((d) => d.kind === 'audioinput')
|
|
162
|
+
if (inputs.length && inputs.every((d) => !d.label)) {
|
|
163
|
+
const s = await navigator.mediaDevices.getUserMedia({ audio: true })
|
|
164
|
+
s.getTracks().forEach((t) => t.stop())
|
|
165
|
+
}
|
|
166
|
+
} catch { /* permission denied → we'll show generic names */ }
|
|
167
|
+
await refreshDevices()
|
|
168
|
+
}, [refreshDevices])
|
|
169
|
+
|
|
170
|
+
const toggleDevicePicker = useCallback(() => {
|
|
171
|
+
setShowDevicePicker((open) => !open)
|
|
172
|
+
}, [])
|
|
173
|
+
|
|
174
|
+
// Populate device labels whenever the picker opens (needs mic permission).
|
|
175
|
+
useEffect(() => {
|
|
176
|
+
if (showDevicePicker) void ensureDeviceLabels()
|
|
177
|
+
}, [showDevicePicker, ensureDeviceLabels])
|
|
178
|
+
|
|
179
|
+
const chooseDevice = useCallback((id: string | null) => {
|
|
180
|
+
setSelectedDeviceId(id)
|
|
181
|
+
try {
|
|
182
|
+
if (id) localStorage.setItem(DEVICE_KEY, id)
|
|
183
|
+
else localStorage.removeItem(DEVICE_KEY)
|
|
184
|
+
} catch { /* ignore */ }
|
|
185
|
+
setShowDevicePicker(false)
|
|
186
|
+
}, [])
|
|
187
|
+
|
|
188
|
+
// Acquire a stream honoring the saved device, gracefully falling back to the
|
|
189
|
+
// OS default if that device was unplugged since it was chosen.
|
|
190
|
+
const getStream = useCallback(async (): Promise<MediaStream> => {
|
|
191
|
+
const id = selectedDeviceIdRef.current
|
|
192
|
+
try {
|
|
193
|
+
return await navigator.mediaDevices.getUserMedia({
|
|
194
|
+
audio: id ? { deviceId: { exact: id } } : true,
|
|
195
|
+
})
|
|
196
|
+
} catch (err) {
|
|
197
|
+
if (id && (err as { name?: string })?.name === 'OverconstrainedError') {
|
|
198
|
+
try { localStorage.removeItem(DEVICE_KEY) } catch { /* ignore */ }
|
|
199
|
+
setSelectedDeviceId(null)
|
|
200
|
+
setNotice('Saved microphone unavailable — using system default.')
|
|
201
|
+
return navigator.mediaDevices.getUserMedia({ audio: true })
|
|
202
|
+
}
|
|
203
|
+
throw err
|
|
204
|
+
}
|
|
205
|
+
}, [])
|
|
206
|
+
|
|
207
|
+
// Keep the list fresh when devices are plugged/unplugged.
|
|
208
|
+
useEffect(() => {
|
|
209
|
+
const md = navigator.mediaDevices
|
|
210
|
+
if (!md?.addEventListener) return
|
|
211
|
+
const onChange = () => { void refreshDevices() }
|
|
212
|
+
md.addEventListener('devicechange', onChange)
|
|
213
|
+
void refreshDevices()
|
|
214
|
+
return () => md.removeEventListener('devicechange', onChange)
|
|
215
|
+
}, [refreshDevices])
|
|
216
|
+
|
|
217
|
+
// Close the picker on an outside click.
|
|
218
|
+
useEffect(() => {
|
|
219
|
+
if (!showDevicePicker) return
|
|
220
|
+
const onDocDown = (e: MouseEvent) => {
|
|
221
|
+
if (!wrapRef.current?.contains(e.target as Node)) setShowDevicePicker(false)
|
|
222
|
+
}
|
|
223
|
+
document.addEventListener('mousedown', onDocDown)
|
|
224
|
+
return () => document.removeEventListener('mousedown', onDocDown)
|
|
225
|
+
}, [showDevicePicker])
|
|
226
|
+
|
|
128
227
|
// ─── Web Speech engine ─────────────────────────────
|
|
129
228
|
|
|
130
229
|
const startWebSpeech = useCallback(() => {
|
|
@@ -204,6 +303,7 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
|
|
|
204
303
|
try {
|
|
205
304
|
const text = await transcribeBlob(blob, handleModelProgress)
|
|
206
305
|
if (text) onTranscriptRef.current(text)
|
|
306
|
+
else setNotice('No speech detected — check the mic source (▾).')
|
|
207
307
|
} catch (err) {
|
|
208
308
|
setError('On-device transcription failed')
|
|
209
309
|
console.error('[voice] local transcription failed', err)
|
|
@@ -217,7 +317,7 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
|
|
|
217
317
|
setError(null); setNotice(null); setInterim('')
|
|
218
318
|
cancelGestureRef.current = false
|
|
219
319
|
try {
|
|
220
|
-
const stream = await
|
|
320
|
+
const stream = await getStream()
|
|
221
321
|
// Released during the permission/await — don't record anything.
|
|
222
322
|
if (cancelGestureRef.current) {
|
|
223
323
|
stream.getTracks().forEach((t) => t.stop())
|
|
@@ -238,7 +338,7 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
|
|
|
238
338
|
setError(describeMicError(err))
|
|
239
339
|
console.error('[voice] getUserMedia failed', err)
|
|
240
340
|
}
|
|
241
|
-
}, [runLocalTranscription, stopStream])
|
|
341
|
+
}, [runLocalTranscription, stopStream, getStream])
|
|
242
342
|
|
|
243
343
|
const stopLocal = useCallback(() => {
|
|
244
344
|
cancelGestureRef.current = true
|
|
@@ -291,11 +391,31 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
|
|
|
291
391
|
else if (listening) chip = { kind: 'listening', text: interim || (engine === 'local' ? 'Recording…' : 'Listening…') }
|
|
292
392
|
else if (busy) chip = { kind: 'busy', text: modelPct !== null ? `Downloading voice model… ${modelPct}%` : 'Transcribing…' }
|
|
293
393
|
|
|
394
|
+
const floating = variant === 'floating'
|
|
395
|
+
const btnClass = floating
|
|
396
|
+
? `voice-btn voice-btn-floating ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
|
|
397
|
+
: `titlebar-btn titlebar-icon-btn voice-btn ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
|
|
398
|
+
|
|
399
|
+
// Hide Chromium's synthetic "default"/"communications" aggregate entries from
|
|
400
|
+
// the list; we expose our own "System default" item instead.
|
|
401
|
+
const realDevices = devices.filter(
|
|
402
|
+
(d) => d.deviceId && d.deviceId !== 'default' && d.deviceId !== 'communications',
|
|
403
|
+
)
|
|
404
|
+
const rawDefault = devices.find((d) => d.deviceId === 'default')?.label
|
|
405
|
+
const defaultLabel = rawDefault
|
|
406
|
+
? `System default · ${rawDefault.replace(/^Default\s*-\s*/i, '')}`
|
|
407
|
+
: 'System default'
|
|
408
|
+
const activeLabel = selectedDeviceId
|
|
409
|
+
? (realDevices.find((d) => d.deviceId === selectedDeviceId)?.label || 'Selected microphone')
|
|
410
|
+
: defaultLabel
|
|
411
|
+
// Source selection only affects the local (getUserMedia) engine.
|
|
412
|
+
const showSourcePicker = floating && LOCAL_SUPPORTED
|
|
413
|
+
|
|
294
414
|
return (
|
|
295
|
-
<div className="voice-input-wrap">
|
|
415
|
+
<div className="voice-input-wrap" ref={wrapRef}>
|
|
296
416
|
<button
|
|
297
417
|
type="button"
|
|
298
|
-
className={
|
|
418
|
+
className={btnClass}
|
|
299
419
|
disabled={!ANY_SUPPORTED}
|
|
300
420
|
onPointerDown={handlePointerDown}
|
|
301
421
|
onPointerUp={handlePointerUp}
|
|
@@ -307,7 +427,51 @@ export function VoiceInput({ onTranscript }: VoiceInputProps) {
|
|
|
307
427
|
<span className="voice-icon" aria-hidden="true">🎤</span>
|
|
308
428
|
<span className={`voice-dot ${listening ? 'on' : busy ? 'busy' : 'off'}`} />
|
|
309
429
|
</button>
|
|
310
|
-
{
|
|
430
|
+
{showSourcePicker && (
|
|
431
|
+
<button
|
|
432
|
+
type="button"
|
|
433
|
+
className="voice-source-btn"
|
|
434
|
+
onPointerDown={(e) => e.stopPropagation()}
|
|
435
|
+
onClick={toggleDevicePicker}
|
|
436
|
+
title={`Mic source: ${activeLabel}`}
|
|
437
|
+
aria-label="Choose microphone source"
|
|
438
|
+
aria-expanded={showDevicePicker}
|
|
439
|
+
>
|
|
440
|
+
▾
|
|
441
|
+
</button>
|
|
442
|
+
)}
|
|
443
|
+
{showSourcePicker && showDevicePicker && (
|
|
444
|
+
<div className="voice-source-menu" role="menu">
|
|
445
|
+
<div className="voice-source-head">Microphone source</div>
|
|
446
|
+
<button
|
|
447
|
+
type="button"
|
|
448
|
+
role="menuitemradio"
|
|
449
|
+
aria-checked={selectedDeviceId === null}
|
|
450
|
+
className={`voice-source-item ${selectedDeviceId === null ? 'active' : ''}`}
|
|
451
|
+
onClick={() => chooseDevice(null)}
|
|
452
|
+
>
|
|
453
|
+
<span className="voice-source-check">{selectedDeviceId === null ? '✓' : ''}</span>
|
|
454
|
+
<span className="voice-source-label">{defaultLabel}</span>
|
|
455
|
+
</button>
|
|
456
|
+
{realDevices.map((d, i) => (
|
|
457
|
+
<button
|
|
458
|
+
key={d.deviceId}
|
|
459
|
+
type="button"
|
|
460
|
+
role="menuitemradio"
|
|
461
|
+
aria-checked={selectedDeviceId === d.deviceId}
|
|
462
|
+
className={`voice-source-item ${selectedDeviceId === d.deviceId ? 'active' : ''}`}
|
|
463
|
+
onClick={() => chooseDevice(d.deviceId)}
|
|
464
|
+
>
|
|
465
|
+
<span className="voice-source-check">{selectedDeviceId === d.deviceId ? '✓' : ''}</span>
|
|
466
|
+
<span className="voice-source-label">{d.label || `Microphone ${i + 1}`}</span>
|
|
467
|
+
</button>
|
|
468
|
+
))}
|
|
469
|
+
{realDevices.length === 0 && (
|
|
470
|
+
<div className="voice-source-empty">No microphones found</div>
|
|
471
|
+
)}
|
|
472
|
+
</div>
|
|
473
|
+
)}
|
|
474
|
+
{chip && <div className={`voice-chip ${chip.kind} ${floating ? 'voice-chip-floating' : ''}`}>{chip.text}</div>}
|
|
311
475
|
</div>
|
|
312
476
|
)
|
|
313
477
|
}
|
|
@@ -6,6 +6,37 @@
|
|
|
6
6
|
const MODEL = 'Xenova/whisper-base'
|
|
7
7
|
const TARGET_SAMPLE_RATE = 16000
|
|
8
8
|
|
|
9
|
+
// Whisper hallucinates filler tokens ("you", "Thank you.", "Thanks for
|
|
10
|
+
// watching.") when fed silence. Two guards below tame that:
|
|
11
|
+
// - Below SILENCE_RMS we treat the clip as silent and skip the model entirely.
|
|
12
|
+
// - Between SILENCE_RMS and LOW_CONFIDENCE_RMS we still run the model but drop
|
|
13
|
+
// the result if it's *only* a known filler phrase (likely a hallucination on
|
|
14
|
+
// a quiet clip rather than real speech).
|
|
15
|
+
const SILENCE_RMS = 0.008
|
|
16
|
+
const LOW_CONFIDENCE_RMS = 0.02
|
|
17
|
+
const FILLER_PHRASES = new Set([
|
|
18
|
+
'you',
|
|
19
|
+
'thank you',
|
|
20
|
+
'thank you very much',
|
|
21
|
+
'thank you for watching',
|
|
22
|
+
'thanks for watching',
|
|
23
|
+
'please subscribe',
|
|
24
|
+
])
|
|
25
|
+
|
|
26
|
+
/**
 * Root-mean-square amplitude of the clip — a cheap loudness proxy. Normalized
 * speech sits around 0.05–0.15; a silent room is well under 0.005.
 *
 * Non-finite samples (NaN / ±Infinity) are counted as zero energy. Otherwise a
 * single bad sample would make the result NaN, and NaN fails every `rms < …`
 * comparison, silently bypassing the silence thresholds that rely on it.
 *
 * @param pcm Mono PCM samples.
 * @returns The RMS amplitude, or 0 for an empty clip. Always a finite number.
 */
function computeRms(pcm: Float32Array): number {
  if (pcm.length === 0) return 0
  let sumOfSquares = 0
  for (const sample of pcm) {
    if (Number.isFinite(sample)) sumOfSquares += sample * sample
  }
  return Math.sqrt(sumOfSquares / pcm.length)
}
|
|
34
|
+
|
|
35
|
+
/**
 * Whether `text` consists solely of one of the known filler phrases.
 *
 * The text is lower-cased, stripped of the punctuation . ! ? , … " ' and has
 * its whitespace collapsed and trimmed before the lookup in FILLER_PHRASES.
 */
function isFillerOnly(text: string): boolean {
  const lowered = text.toLowerCase()
  const withoutPunctuation = lowered.replace(/[.!?,…"']/g, '')
  const collapsed = withoutPunctuation.replace(/\s+/g, ' ').trim()
  return FILLER_PHRASES.has(collapsed)
}
|
|
39
|
+
|
|
9
40
|
export interface ModelProgress {
|
|
10
41
|
status: string
|
|
11
42
|
file?: string
|
|
@@ -77,12 +108,25 @@ async function blobToPcm16k(blob: Blob): Promise<Float32Array | null> {
|
|
|
77
108
|
}
|
|
78
109
|
|
|
79
110
|
/**
 * Transcribe a recorded audio blob on-device.
 *
 * @param blob Recorded audio to transcribe.
 * @param onProgress Optional callback passed through to the model loader.
 * @returns The trimmed transcript, or `''` when the clip cannot be decoded, is
 *   quieter than SILENCE_RMS, or is quieter than LOW_CONFIDENCE_RMS and the
 *   model produced only a known filler phrase.
 */
export async function transcribeBlob(blob: Blob, onProgress?: (p: ModelProgress) => void): Promise<string> {
  // Decode before touching the model, so a silent clip never triggers the
  // (one-time, ~75MB) model download only to be discarded.
  const samples = await blobToPcm16k(blob)
  if (!samples) return ''

  const rms = computeRms(samples)
  if (rms < SILENCE_RMS) {
    console.info(`[voice] near-silent clip (rms=${rms.toFixed(4)}); skipping transcription`)
    return ''
  }

  const transcriber = await loadTranscriber(onProgress)
  const output = await transcriber(samples)
  // The transcriber may return one result or an array of them.
  const joined = Array.isArray(output)
    ? output.map((r) => r.text).join(' ')
    : output?.text || ''
  const text = joined.trim()

  // On a quiet clip, filler-only output is dropped as a likely hallucination.
  if (text && rms < LOW_CONFIDENCE_RMS && isFillerOnly(text)) {
    console.info(`[voice] dropping filler-only output "${text}" from quiet clip (rms=${rms.toFixed(4)})`)
    return ''
  }
  return text
}
|