@phenx-inc/ctlsurf 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/electron-vite.config.ts +5 -0
  2. package/out/headless/index.mjs +2 -1
  3. package/out/headless/index.mjs.map +2 -2
  4. package/out/main/index.js +3 -0
  5. package/out/renderer/assets/{cssMode-D9-xaWSI.js → cssMode-DbMmcl1h.js} +3 -3
  6. package/out/renderer/assets/{freemarker2-CoRAVxnv.js → freemarker2-CvaHiy92.js} +1 -1
  7. package/out/renderer/assets/{handlebars-B0p9Wgkw.js → handlebars-D58lUIOu.js} +1 -1
  8. package/out/renderer/assets/{html-D_XFJJtO.js → html-D1h1aJbM.js} +1 -1
  9. package/out/renderer/assets/{htmlMode-naWw6PWr.js → htmlMode-BdkAp9qr.js} +3 -3
  10. package/out/renderer/assets/{index-DBt_rov1.js → index-B60JU1yI.js} +419 -107
  11. package/out/renderer/assets/{index-ezC-iarf.css → index-DJFYmHjz.css} +89 -0
  12. package/out/renderer/assets/{javascript-DDLsFUr-.js → javascript-CXqZcnvb.js} +2 -2
  13. package/out/renderer/assets/{jsonMode-Ixhcm5I6.js → jsonMode-BuVr-eSl.js} +3 -3
  14. package/out/renderer/assets/{liquid-BHgSYEHk.js → liquid-LKu0Wd0B.js} +1 -1
  15. package/out/renderer/assets/{lspLanguageFeatures-ClbEdD0U.js → lspLanguageFeatures-Cjr_4HGs.js} +1 -1
  16. package/out/renderer/assets/{mdx-DMngMjHR.js → mdx-Bl84ILla.js} +1 -1
  17. package/out/renderer/assets/ort-wasm-simd-threaded.asyncify-DMmc6YqF.wasm +0 -0
  18. package/out/renderer/assets/{python-D_czoeY2.js → python-0sFd9G1k.js} +1 -1
  19. package/out/renderer/assets/{razor-CLMDGvL7.js → razor-Cqcu1rLJ.js} +1 -1
  20. package/out/renderer/assets/transformers.web-DtSCnG36.js +33668 -0
  21. package/out/renderer/assets/{tsMode-EIuSGG42.js → tsMode-CYd3NUkW.js} +1 -1
  22. package/out/renderer/assets/{typescript-DQkV4kKA.js → typescript-rkc9lhpi.js} +1 -1
  23. package/out/renderer/assets/{xml-DJ0OOQTu.js → xml-EsHEUps1.js} +1 -1
  24. package/out/renderer/assets/{yaml-DxX26XLN.js → yaml-B9-nQ_s2.js} +1 -1
  25. package/out/renderer/index.html +2 -2
  26. package/package.json +2 -1
  27. package/src/main/index.ts +7 -0
  28. package/src/renderer/App.tsx +13 -1
  29. package/src/renderer/components/TerminalPanel.tsx +6 -0
  30. package/src/renderer/components/VoiceInput.tsx +313 -0
  31. package/src/renderer/lib/localWhisper.ts +88 -0
  32. package/src/renderer/styles.css +89 -0
@@ -1,4 +1,4 @@
1
- import { c as createWebWorker, e as editor, U as Uri, a as MarkerTag, M as MarkerSeverity, l as languages, t as typescriptDefaults, R as Range } from "./index-DBt_rov1.js";
1
+ import { c as createWebWorker, e as editor, U as Uri, a as MarkerTag, M as MarkerSeverity, l as languages, t as typescriptDefaults, R as Range } from "./index-B60JU1yI.js";
2
2
  class WorkerManager {
3
3
  constructor(_modeId, _defaults) {
4
4
  this._modeId = _modeId;
@@ -1,4 +1,4 @@
1
- import { l as languages } from "./index-DBt_rov1.js";
1
+ import { l as languages } from "./index-B60JU1yI.js";
2
2
  const conf = {
3
3
  wordPattern: /(-?\d*\.\d\w*)|([^\`\~\!\@\#\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s]+)/g,
4
4
  comments: {
@@ -1,4 +1,4 @@
1
- import { l as languages } from "./index-DBt_rov1.js";
1
+ import { l as languages } from "./index-B60JU1yI.js";
2
2
  const conf = {
3
3
  comments: {
4
4
  blockComment: ["<!--", "-->"]
@@ -1,4 +1,4 @@
1
- import { l as languages } from "./index-DBt_rov1.js";
1
+ import { l as languages } from "./index-B60JU1yI.js";
2
2
  const conf = {
3
3
  comments: {
4
4
  lineComment: "#"
@@ -4,8 +4,8 @@
4
4
  <meta charset="UTF-8" />
5
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
6
  <title>ctlsurf-worker</title>
7
- <script type="module" crossorigin src="./assets/index-DBt_rov1.js"></script>
8
- <link rel="stylesheet" crossorigin href="./assets/index-ezC-iarf.css">
7
+ <script type="module" crossorigin src="./assets/index-B60JU1yI.js"></script>
8
+ <link rel="stylesheet" crossorigin href="./assets/index-DJFYmHjz.css">
9
9
  </head>
10
10
  <body>
11
11
  <div id="root"></div>
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@phenx-inc/ctlsurf",
3
- "version": "0.5.2",
3
+ "version": "0.6.0",
4
4
  "description": "Agent-agnostic terminal and desktop app for ctlsurf — run Claude Code, Codex, or any coding agent with live session logging and remote control",
5
5
  "main": "out/main/index.js",
6
6
  "bin": {
@@ -43,6 +43,7 @@
43
43
  "node": ">=18"
44
44
  },
45
45
  "dependencies": {
46
+ "@huggingface/transformers": "^4.2.0",
46
47
  "@monaco-editor/react": "^4.7.0",
47
48
  "@xterm/addon-fit": "^0.10.0",
48
49
  "@xterm/addon-serialize": "^0.14.0",
package/src/main/index.ts CHANGED
@@ -132,6 +132,13 @@ function createWindow(): void {
132
132
  }
133
133
  })
134
134
 
135
+ // Voice typing needs microphone access. Approve permission requests (there
136
+ // was no handler before, so the renderer already had the default-permissive
137
+ // behavior — this just ensures the mic request resolves to "allow").
138
+ mainWindow.webContents.session.setPermissionRequestHandler((_wc, _permission, callback) => {
139
+ callback(true)
140
+ })
141
+
135
142
  if (process.env.ELECTRON_RENDERER_URL) {
136
143
  mainWindow.loadURL(process.env.ELECTRON_RENDERER_URL)
137
144
  } else {
@@ -1,5 +1,6 @@
1
1
  import { useState, useEffect, useCallback, useRef } from 'react'
2
- import { TerminalPanel, destroyTerminal } from './components/TerminalPanel'
2
+ import { TerminalPanel, destroyTerminal, focusTerminal } from './components/TerminalPanel'
3
+ import { VoiceInput } from './components/VoiceInput'
3
4
  import { CtlsurfPanel } from './components/CtlsurfPanel'
4
5
  import { EditorPanel } from './components/EditorPanel'
5
6
  import { AgentPicker } from './components/AgentPicker'
@@ -206,6 +207,16 @@ export default function App() {
206
207
  }
207
208
  }, [trackingActive])
208
209
 
210
+ // Voice typing: inject the transcribed text into the active terminal exactly
211
+ // as if it were typed (no auto-submit), then refocus so the user can press
212
+ // Enter to send it.
213
+ const handleVoiceTranscript = useCallback((text: string) => {
214
+ const trimmed = text.trim()
215
+ if (!trimmed) return
216
+ window.worker.writePty(activeTabId, trimmed)
217
+ focusTerminal(activeTabId)
218
+ }, [activeTabId])
219
+
209
220
  const cwdRef = useRef<string | null>(null)
210
221
 
211
222
  const handleSpawn = useCallback(async (tabId: string, agent: AgentConfig) => {
@@ -471,6 +482,7 @@ export default function App() {
471
482
  </svg>
472
483
  <span>Tickets</span>
473
484
  </button>
485
+ <VoiceInput onTranscript={handleVoiceTranscript} />
474
486
  <span className="titlebar-separator" />
475
487
  {agents.map(a => {
476
488
  const activeTab = tabs.find(t => t.id === activeTabId)
@@ -185,6 +185,12 @@ function getOrCreateTerminal(tabId: string, onExit: (tabId: string) => void): {
185
185
  return { terminal, fitAddon }
186
186
  }
187
187
 
188
/**
 * Give keyboard focus back to the terminal registered for `tabId` (used after
 * voice text has been inserted so Enter can be pressed right away).
 * Does nothing when no terminal is registered for that tab.
 */
export function focusTerminal(tabId: string): void {
  const entry = _terminals.get(tabId)
  if (entry == null) return
  entry.terminal.focus()
}
193
+
188
194
  export function destroyTerminal(tabId: string): void {
189
195
  const state = _terminals.get(tabId)
190
196
  if (!state) return
@@ -0,0 +1,313 @@
1
+ import { useCallback, useEffect, useRef, useState } from 'react'
2
+ import { transcribeBlob, type ModelProgress } from '../lib/localWhisper'
3
+
4
+ // ─── Minimal Web Speech API typings ──────────────────
5
+ // webkitSpeechRecognition isn't in the standard DOM lib, so declare just the
6
+ // surface we use. This API is frequently unavailable inside Electron (Chromium
7
+ // ships without Google's speech backend); when it fails we fall back to a local
8
+ // Whisper model (see ../lib/localWhisper).
9
+
10
+ interface SpeechRecognitionResult {
11
+ isFinal: boolean
12
+ 0: { transcript: string }
13
+ }
14
+ interface SpeechRecognitionEvent {
15
+ resultIndex: number
16
+ results: { length: number;[index: number]: SpeechRecognitionResult }
17
+ }
18
+ interface SpeechRecognitionErrorEvent { error: string }
19
+ interface SpeechRecognitionLike {
20
+ lang: string
21
+ continuous: boolean
22
+ interimResults: boolean
23
+ start: () => void
24
+ stop: () => void
25
+ abort: () => void
26
+ onresult: ((e: SpeechRecognitionEvent) => void) | null
27
+ onerror: ((e: SpeechRecognitionErrorEvent) => void) | null
28
+ onend: (() => void) | null
29
+ }
30
+ type SpeechRecognitionCtor = new () => SpeechRecognitionLike
31
+
32
/**
 * Look up the speech-recognition constructor on `window`, preferring the
 * unprefixed `SpeechRecognition` over `webkitSpeechRecognition`.
 *
 * @returns the constructor, or null when neither global is present.
 */
function getRecognitionCtor(): SpeechRecognitionCtor | null {
  const { SpeechRecognition, webkitSpeechRecognition } = window as unknown as {
    SpeechRecognition?: SpeechRecognitionCtor
    webkitSpeechRecognition?: SpeechRecognitionCtor
  }
  if (SpeechRecognition) return SpeechRecognition
  if (webkitSpeechRecognition) return webkitSpeechRecognition
  return null
}
39
+
40
+ // ─── Capabilities & engine selection ─────────────────
41
+
42
+ type Engine = 'web-speech' | 'local'
43
+ type Phase = 'idle' | 'listening' | 'transcribing'
44
+
45
+ const ENGINE_KEY = 'ctlsurf.voiceEngine'
46
+
47
+ const WEB_SPEECH_SUPPORTED = getRecognitionCtor() !== null
48
+ const LOCAL_SUPPORTED =
49
+ typeof navigator !== 'undefined' &&
50
+ !!navigator.mediaDevices?.getUserMedia &&
51
+ typeof MediaRecorder !== 'undefined' &&
52
+ typeof OfflineAudioContext !== 'undefined'
53
+ const ANY_SUPPORTED = WEB_SPEECH_SUPPORTED || LOCAL_SUPPORTED
54
+
55
/**
 * Pick the engine to start with.
 *
 * - Only the local engine is supported: always 'local'.
 * - Local is supported and the stored preference is 'local': 'local'.
 * - Otherwise: 'web-speech' when supported, else 'local'.
 *
 * A failing localStorage read is treated as "no stored preference".
 */
function loadInitialEngine(): Engine {
  const localOnly = LOCAL_SUPPORTED && !WEB_SPEECH_SUPPORTED
  if (localOnly) return 'local'

  let saved: string | null
  try {
    saved = localStorage.getItem(ENGINE_KEY)
  } catch {
    saved = null
  }
  if (saved === 'local' && LOCAL_SUPPORTED) return 'local'

  return WEB_SPEECH_SUPPORTED ? 'web-speech' : 'local'
}
62
+
63
/**
 * True for the Web Speech error codes that mean the recognition engine itself
 * cannot be reached ('network', 'service-not-allowed'), as opposed to a
 * microphone permission/hardware problem that would also break the local
 * fallback.
 */
function isEngineUnavailable(code: string): boolean {
  switch (code) {
    case 'network':
    case 'service-not-allowed':
      return true
    default:
      return false
  }
}
68
+
69
/**
 * Map a thrown value to a short user-facing message based on its `name`
 * property. Values without a recognised name (including null/undefined) get
 * the generic message.
 */
function describeMicError(err: unknown): string {
  const errorName = (err as { name?: string } | null | undefined)?.name
  switch (errorName) {
    case 'NotAllowedError':
    case 'SecurityError':
      return 'Microphone access denied'
    case 'NotFoundError':
      return 'No microphone found'
    default:
      return 'Could not start microphone'
  }
}
75
+
76
+ interface VoiceInputProps {
77
+ // Called once per push-to-talk session with the final transcribed text.
78
+ onTranscript: (text: string) => void
79
+ }
80
+
81
/**
 * Push-to-talk microphone button.
 *
 * While the button is held, speech is captured with one of two engines:
 *  - 'web-speech': the browser SpeechRecognition API, with interim text shown
 *    in a chip while listening;
 *  - 'local': the microphone is recorded with MediaRecorder and the clip is
 *    handed to `transcribeBlob` on release.
 * When Web Speech reports that its engine is unreachable and local capture is
 * supported, the component switches to 'local', persists that choice and asks
 * the user to press again.
 *
 * @param onTranscript called with the final text of a push-to-talk session
 *   (only when that text is non-empty).
 */
export function VoiceInput({ onTranscript }: VoiceInputProps) {
  const [engine, setEngine] = useState<Engine>(loadInitialEngine)
  const [phase, setPhase] = useState<Phase>('idle')
  const [interim, setInterim] = useState('')
  const [modelPct, setModelPct] = useState<number | null>(null)
  const [error, setError] = useState<string | null>(null)
  const [notice, setNotice] = useState<string | null>(null)

  // Web Speech refs
  const recognitionRef = useRef<SpeechRecognitionLike | null>(null)
  const finalRef = useRef('')
  // Local (Whisper) refs
  const streamRef = useRef<MediaStream | null>(null)
  const recorderRef = useRef<MediaRecorder | null>(null)
  const chunksRef = useRef<Blob[]>([])
  // Token of the newest local capture attempt. Every press takes a fresh token
  // and every release/unmount bumps it, so a getUserMedia() call that resolves
  // late can tell it was cancelled or superseded. A plain boolean is not
  // enough: the next press would reset it and revive the cancelled attempt,
  // leaving two recorders running and the first microphone stream leaked.
  const localAttemptRef = useRef(0)

  // Mirror state/props into refs so long-lived callbacks read current values.
  const engineRef = useRef(engine)
  useEffect(() => { engineRef.current = engine }, [engine])

  const onTranscriptRef = useRef(onTranscript)
  useEffect(() => { onTranscriptRef.current = onTranscript }, [onTranscript])

  // Auto-dismiss transient chips.
  useEffect(() => {
    if (!error) return
    const t = setTimeout(() => setError(null), 4500)
    return () => clearTimeout(t)
  }, [error])
  useEffect(() => {
    if (!notice) return
    const t = setTimeout(() => setNotice(null), 5000)
    return () => clearTimeout(t)
  }, [notice])

  // Persist and activate the local engine, showing `reason` as a notice.
  const switchToLocal = useCallback((reason: string) => {
    try { localStorage.setItem(ENGINE_KEY, 'local') } catch { /* ignore */ }
    setEngine('local')
    setNotice(reason)
  }, [])

  // Release the microphone held by the current local capture, if any.
  const stopStream = useCallback(() => {
    streamRef.current?.getTracks().forEach((t) => t.stop())
    streamRef.current = null
  }, [])

  // ─── Web Speech engine ─────────────────────────────

  const startWebSpeech = useCallback(() => {
    const Ctor = getRecognitionCtor()
    if (!Ctor || recognitionRef.current) return
    setError(null); setNotice(null); setInterim('')
    finalRef.current = ''

    const rec = new Ctor()
    rec.lang = navigator.language || 'en-US'
    rec.continuous = true
    rec.interimResults = true

    rec.onresult = (event) => {
      // Rebuild both strings from the whole result list on every event, so
      // finalRef always holds the complete final text for this session.
      let finalText = ''
      let interimText = ''
      for (let i = 0; i < event.results.length; i++) {
        const res = event.results[i]
        if (res.isFinal) finalText += res[0].transcript
        else interimText += res[0].transcript
      }
      finalRef.current = finalText
      setInterim(interimText)
    }

    rec.onerror = (event) => {
      if (isEngineUnavailable(event.error) && LOCAL_SUPPORTED) {
        // The streamed audio is gone; switch engines and ask for a retry.
        finalRef.current = ''
        switchToLocal('Voice service unavailable — switched to on-device. Press again.')
      } else if (event.error !== 'no-speech' && event.error !== 'aborted') {
        setError(event.error === 'not-allowed' ? 'Microphone access denied' : `Voice error: ${event.error}`)
      }
    }

    rec.onend = () => {
      const text = finalRef.current.trim()
      recognitionRef.current = null
      setPhase('idle')
      setInterim('')
      if (text) onTranscriptRef.current(text)
    }

    recognitionRef.current = rec
    try {
      rec.start()
      setPhase('listening')
    } catch (err) {
      recognitionRef.current = null
      setPhase('idle')
      setError('Could not start microphone')
      console.error('[voice] web speech start failed', err)
    }
  }, [switchToLocal])

  const stopWebSpeech = useCallback(() => {
    try { recognitionRef.current?.stop() } catch { /* already stopped */ }
  }, [])

  // ─── Local (Whisper) engine ────────────────────────

  const handleModelProgress = useCallback((p: ModelProgress) => {
    if (p.status === 'progress' && typeof p.progress === 'number') {
      setModelPct(Math.min(100, Math.round(p.progress)))
    } else if (p.status === 'ready') {
      // NOTE(review): assumes the library reports status 'ready' once the
      // pipeline is built — confirm against the installed version. Clearing
      // the percentage here lets the chip fall back to "Transcribing…" instead
      // of showing "Downloading voice model… 100%" for the whole transcription.
      setModelPct(null)
    }
  }, [])

  // Runs from the recorder's `stop` event: package the recorded chunks and
  // hand them to the local transcriber.
  const runLocalTranscription = useCallback(async (rec: MediaRecorder) => {
    stopStream()
    const blob = new Blob(chunksRef.current, { type: rec.mimeType || 'audio/webm' })
    chunksRef.current = []
    recorderRef.current = null
    if (blob.size === 0) { setPhase('idle'); return }

    setPhase('transcribing')
    setInterim('')
    try {
      const text = await transcribeBlob(blob, handleModelProgress)
      if (text) onTranscriptRef.current(text)
    } catch (err) {
      setError('On-device transcription failed')
      console.error('[voice] local transcription failed', err)
    } finally {
      setPhase('idle')
      setModelPct(null)
    }
  }, [stopStream, handleModelProgress])

  const startLocal = useCallback(async () => {
    setError(null); setNotice(null); setInterim('')
    const attempt = ++localAttemptRef.current
    let stream: MediaStream | null = null
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true })
      // Released, superseded by a newer press, or unmounted while the
      // permission prompt / await was pending — don't record anything. The
      // phase was never changed by this attempt, so there is nothing to reset.
      if (attempt !== localAttemptRef.current) {
        stream.getTracks().forEach((t) => t.stop())
        return
      }
      streamRef.current = stream
      chunksRef.current = []
      const rec = new MediaRecorder(stream)
      rec.ondataavailable = (e) => { if (e.data.size) chunksRef.current.push(e.data) }
      rec.onstop = () => { void runLocalTranscription(rec) }
      recorderRef.current = rec
      rec.start()
      setPhase('listening')
    } catch (err) {
      // Tear down only what this attempt owns, so a failure of a stale attempt
      // cannot stop the stream of a newer one that is already recording.
      const owned = stream !== null && streamRef.current === stream
      stream?.getTracks().forEach((t) => t.stop())
      if (owned) {
        streamRef.current = null
        recorderRef.current = null
        setPhase('idle')
      }
      setError(describeMicError(err))
      console.error('[voice] getUserMedia failed', err)
    }
  }, [runLocalTranscription])

  const stopLocal = useCallback(() => {
    // Invalidate any getUserMedia() call that has not resolved yet (quick tap).
    localAttemptRef.current += 1
    const rec = recorderRef.current
    if (rec && rec.state !== 'inactive') {
      try { rec.stop() } catch { /* ignore */ }
    }
  }, [])

  // ─── Push-to-talk gesture ──────────────────────────

  const handlePointerDown = (e: React.PointerEvent) => {
    if (!ANY_SUPPORTED || phase !== 'idle') return
    e.preventDefault()
    // Capture the pointer so the matching up/cancel event reaches this button.
    e.currentTarget.setPointerCapture?.(e.pointerId)
    if (engineRef.current === 'web-speech' && WEB_SPEECH_SUPPORTED) startWebSpeech()
    else if (LOCAL_SUPPORTED) void startLocal()
  }
  const handlePointerUp = (e: React.PointerEvent) => {
    e.currentTarget.releasePointerCapture?.(e.pointerId)
    if (engineRef.current === 'web-speech') stopWebSpeech()
    else stopLocal()
  }

  // Clean up on unmount. Handlers are detached BEFORE stopping so that tearing
  // the component down never triggers a transcription or injects text, and the
  // attempt token is bumped so a getUserMedia() still in flight releases its
  // stream instead of starting a recording nobody can stop.
  useEffect(() => () => {
    localAttemptRef.current += 1

    const recognition = recognitionRef.current
    recognitionRef.current = null
    if (recognition) {
      recognition.onresult = null
      recognition.onerror = null
      recognition.onend = null
      try { recognition.abort() } catch { /* ignore */ }
    }

    const recorder = recorderRef.current
    recorderRef.current = null
    if (recorder) {
      recorder.ondataavailable = null
      recorder.onstop = null
      if (recorder.state !== 'inactive') {
        try { recorder.stop() } catch { /* ignore */ }
      }
    }
    chunksRef.current = []

    streamRef.current?.getTracks().forEach((t) => t.stop())
    streamRef.current = null
  }, [])

  // ─── Render ────────────────────────────────────────

  const listening = phase === 'listening'
  const busy = phase === 'transcribing'

  const title = !ANY_SUPPORTED
    ? 'Voice typing not supported in this build'
    : listening
      ? 'Listening… release to insert'
      : busy
        ? 'Transcribing…'
        : engine === 'local'
          ? 'Hold to talk (on-device) — speech is typed into the terminal'
          : 'Hold to talk — speech is typed into the terminal'

  // Status chip under the button: errors and notices only show while idle.
  let chip: { kind: 'listening' | 'busy' | 'notice' | 'error'; text: string } | null = null
  if (error && phase === 'idle') chip = { kind: 'error', text: error }
  else if (notice && phase === 'idle') chip = { kind: 'notice', text: notice }
  else if (listening) chip = { kind: 'listening', text: interim || (engine === 'local' ? 'Recording…' : 'Listening…') }
  else if (busy) chip = { kind: 'busy', text: modelPct !== null ? `Downloading voice model… ${modelPct}%` : 'Transcribing…' }

  return (
    <div className="voice-input-wrap">
      <button
        type="button"
        className={`titlebar-btn titlebar-icon-btn voice-btn ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`}
        disabled={!ANY_SUPPORTED}
        onPointerDown={handlePointerDown}
        onPointerUp={handlePointerUp}
        onPointerCancel={handlePointerUp}
        onContextMenu={(e) => e.preventDefault()}
        title={title}
        aria-label="Voice typing (hold to talk)"
      >
        <span className="voice-icon" aria-hidden="true">🎤</span>
        <span className={`voice-dot ${listening ? 'on' : busy ? 'busy' : 'off'}`} />
      </button>
      {chip && <div className={`voice-chip ${chip.kind}`}>{chip.text}</div>}
    </div>
  )
}
@@ -0,0 +1,88 @@
1
+ // Local, offline speech-to-text via transformers.js (Whisper). Used as the
2
+ // fallback when the browser's Web Speech API is unavailable (the common case
3
+ // inside packaged Electron). The library and the ~75MB model are fetched lazily
4
+ // the first time a local transcription is requested, then cached by the runtime.
5
+
6
+ const MODEL = 'Xenova/whisper-base'
7
+ const TARGET_SAMPLE_RATE = 16000
8
+
9
+ export interface ModelProgress {
10
+ status: string
11
+ file?: string
12
+ progress?: number // 0–100 for the file currently downloading
13
+ loaded?: number
14
+ total?: number
15
+ }
16
+
17
+ // transformers.js is large and node-aware, so import it dynamically (Vite
18
+ // code-splits it into its own chunk that only loads on first local use).
19
+ type Transcriber = (audio: Float32Array, options?: Record<string, unknown>) => Promise<{ text: string } | Array<{ text: string }>>
20
+
21
+ let transcriberPromise: Promise<Transcriber> | null = null
22
+
23
/**
 * Reports whether a transcriber load has been requested and not failed.
 * Note that this stays true after the model has finished loading: the cached
 * promise is only reset to null when a load attempt rejects.
 */
export function isLocalModelLoading(): boolean {
  const requested = transcriberPromise !== null
  return requested
}
27
/**
 * Check whether WebGPU can actually be used, not merely whether the API
 * object exists: `navigator.gpu` can be present while `requestAdapter()`
 * still resolves to null because there is no usable adapter.
 */
async function hasUsableWebGpu(): Promise<boolean> {
  if (typeof navigator === 'undefined') return false
  const gpu = (navigator as unknown as { gpu?: { requestAdapter?: () => Promise<unknown> } }).gpu
  if (!gpu || typeof gpu.requestAdapter !== 'function') return false
  try {
    return (await gpu.requestAdapter()) != null
  } catch {
    return false
  }
}

/**
 * Lazily create the speech-recognition pipeline and return it. The pipeline
 * is created once and the same promise is shared by all callers.
 *
 * @param onProgress forwarded to the library as `progress_callback`. Only the
 *   callback passed by the call that starts the load is wired up; later
 *   callers share the in-flight/finished load and their callback is unused.
 * @returns the transcriber function.
 * @throws whatever the dynamic import or pipeline creation rejects with; the
 *   cached promise is cleared in that case so a later call can retry.
 */
export async function loadTranscriber(onProgress?: (p: ModelProgress) => void): Promise<Transcriber> {
  if (!transcriberPromise) {
    const pending: Promise<Transcriber> = (async () => {
      const { pipeline, env } = await import('@huggingface/transformers')
      // We don't ship model files; always fetch from the Hugging Face hub.
      env.allowLocalModels = false
      const common = { progress_callback: onProgress as never }
      // Prefer WebGPU for speed, but only when an adapter is really available;
      // otherwise go straight to the default backend so no download is spent
      // on a WebGPU attempt that cannot succeed.
      if (await hasUsableWebGpu()) {
        try {
          return (await pipeline('automatic-speech-recognition', MODEL, { ...common, device: 'webgpu' })) as unknown as Transcriber
        } catch (err) {
          console.warn('[voice] WebGPU backend failed, falling back to WASM', err)
        }
      }
      return (await pipeline('automatic-speech-recognition', MODEL, common)) as unknown as Transcriber
    })()
    transcriberPromise = pending
    // Allow a later retry if the first load fails (e.g. offline on first use).
    // Only clear the cache if it still holds the promise that failed.
    pending.catch(() => {
      if (transcriberPromise === pending) transcriberPromise = null
    })
  }
  return transcriberPromise
}
52
+
53
/**
 * Decode a recorded audio Blob and render it through a one-channel
 * OfflineAudioContext running at TARGET_SAMPLE_RATE.
 *
 * @returns channel 0 of the rendered buffer, or null when the blob is empty,
 *   cannot be decoded, or decodes to less than one output frame.
 */
async function blobToPcm16k(blob: Blob): Promise<Float32Array | null> {
  if (blob.size === 0) return null

  const encoded = await blob.arrayBuffer()
  const ContextCtor = window.AudioContext || (window as unknown as { webkitAudioContext: typeof AudioContext }).webkitAudioContext
  const decoder = new ContextCtor()
  let audio: AudioBuffer
  try {
    audio = await decoder.decodeAudioData(encoded)
  } catch {
    return null
  } finally {
    // The decoding context is only needed for decodeAudioData.
    decoder.close()
  }

  const frameCount = Math.ceil(audio.duration * TARGET_SAMPLE_RATE)
  if (frameCount < 1) return null

  const resampler = new OfflineAudioContext(1, frameCount, TARGET_SAMPLE_RATE)
  const player = resampler.createBufferSource()
  player.buffer = audio
  player.connect(resampler.destination)
  player.start()
  return (await resampler.startRendering()).getChannelData(0)
}
78
+
79
/**
 * Transcribe a recorded audio clip with the local model.
 *
 * @param blob recorded audio as produced by the recorder.
 * @param onProgress optional progress callback, forwarded to `loadTranscriber`.
 * @returns the trimmed transcript, or '' when the clip is empty/undecodable or
 *   the model produced no text.
 * @throws if the model fails to load or the transcription itself rejects.
 */
export async function transcribeBlob(blob: Blob, onProgress?: (p: ModelProgress) => void): Promise<string> {
  // Decode first: an empty or undecodable clip should not trigger the model
  // load (a large download on first use).
  const pcm = await blobToPcm16k(blob)
  if (!pcm) return ''
  const transcriber = await loadTranscriber(onProgress)
  // Whisper processes 30-second windows. Enable chunked long-form decoding so
  // that a recording longer than one window is transcribed in full.
  const result = await transcriber(pcm, { chunk_length_s: 30, stride_length_s: 5 })
  const text = Array.isArray(result)
    ? result.map((r) => r.text).join(' ')
    : result?.text
  return (text || '').trim()
}
@@ -570,6 +570,95 @@ html, body, #root {
570
570
  line-height: 1;
571
571
  }
572
572
 
573
+ /* Voice typing (push-to-talk mic) */
574
+ .voice-input-wrap {
575
+ position: relative;
576
+ display: inline-flex;
577
+ }
578
+ .voice-btn {
579
+ user-select: none;
580
+ -webkit-user-select: none;
581
+ touch-action: none;
582
+ }
583
+ .voice-btn:disabled {
584
+ opacity: 0.4;
585
+ cursor: not-allowed;
586
+ }
587
+ .voice-btn.listening {
588
+ color: #f7768e;
589
+ border-color: #f7768e;
590
+ background: #1f2335;
591
+ }
592
+ .voice-btn.busy {
593
+ color: #e0af68;
594
+ border-color: #e0af68;
595
+ background: #1f2335;
596
+ }
597
+ .voice-icon {
598
+ font-size: 13px;
599
+ line-height: 1;
600
+ }
601
+ .voice-dot {
602
+ width: 6px;
603
+ height: 6px;
604
+ border-radius: 50%;
605
+ display: inline-block;
606
+ vertical-align: middle;
607
+ background: #565f89;
608
+ }
609
+ .voice-dot.on {
610
+ background: #f7768e;
611
+ box-shadow: 0 0 4px #f7768e;
612
+ animation: voice-pulse 1s ease-in-out infinite;
613
+ }
614
+ .voice-dot.busy {
615
+ background: #e0af68;
616
+ box-shadow: 0 0 4px #e0af68;
617
+ animation: voice-pulse 0.8s ease-in-out infinite;
618
+ }
619
+ @keyframes voice-pulse {
620
+ 0%, 100% { opacity: 1; }
621
+ 50% { opacity: 0.3; }
622
+ }
623
+ .voice-chip {
624
+ position: absolute;
625
+ top: 100%;
626
+ right: 0;
627
+ margin-top: 6px;
628
+ max-width: 320px;
629
+ padding: 4px 9px;
630
+ border-radius: 5px;
631
+ font-size: 11px;
632
+ line-height: 1.3;
633
+ white-space: nowrap;
634
+ overflow: hidden;
635
+ text-overflow: ellipsis;
636
+ z-index: 50;
637
+ pointer-events: none;
638
+ border: 1px solid #3b3d57;
639
+ }
640
+ .voice-chip.listening {
641
+ background: #1f2335;
642
+ color: #a9b1d6;
643
+ }
644
+ .voice-chip.busy {
645
+ background: #1f2335;
646
+ color: #e0af68;
647
+ border-color: #e0af68;
648
+ }
649
+ .voice-chip.notice {
650
+ background: #1f2335;
651
+ color: #e0af68;
652
+ border-color: #e0af68;
653
+ white-space: normal;
654
+ }
655
+ .voice-chip.error {
656
+ background: #2d2030;
657
+ color: #f7768e;
658
+ border-color: #f7768e;
659
+ white-space: normal;
660
+ }
661
+
573
662
  /* Editor panel */
574
663
  .editor-panel {
575
664
  display: flex;