npm - @phenx-inc/ctlsurf - Versions diffs - 0.7.0 → 0.8.0 - Mend

@phenx-inc/ctlsurf 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/src/renderer/components/SpeakControls.tsx ADDED Viewed

@@ -0,0 +1,235 @@
+import { useCallback, useEffect, useRef, useState } from 'react'
+import {
+  speech,
+  getEngine,
+  setEngine as persistEngine,
+  getVoiceURI,
+  setVoiceURI as persistVoiceURI,
+  getRate,
+  setRate as persistRate,
+  listWebVoices,
+  type TtsEngineId,
+  type TtsModelProgress,
+} from '../lib/speech'
+// Phrase spoken by the "Test" button in the voice menu.
+const SAMPLE = 'This is how spoken agent replies will sound.'
+// Titlebar control for "speak agent replies" (Electron desktop only). A speaker
+// toggle drives the main-process speakReplies setting (which runs the transcript
+// tailer); a ▾ dropdown picks the engine, voice, and rate. A stop button is
+// shown while speech is active, and the latest speech error appears as a chip
+// that dismisses itself after four seconds.
+export function SpeakControls() {
+  // Mirrors the main-process setting; starts off until the persisted value loads.
+  const [enabled, setEnabled] = useState(false)
+  // Engine, voice, and rate are seeded lazily from the persisted preferences.
+  const [engine, setEngineState] = useState<TtsEngineId>(getEngine)
+  const [voices, setVoices] = useState<SpeechSynthesisVoice[]>([])
+  const [voiceURI, setVoiceURIState] = useState<string | null>(getVoiceURI)
+  const [rate, setRateState] = useState<number>(getRate)
+  const [showMenu, setShowMenu] = useState(false)
+  // Model progress percentage shown on the button; null hides the badge.
+  const [modelPct, setModelPct] = useState<number | null>(null)
+  const [speaking, setSpeaking] = useState(false)
+  const [error, setError] = useState<string | null>(null)
+  // Wraps every control so the outside-click handler can tell inside from outside.
+  const wrapRef = useRef<HTMLDivElement>(null)
+  // Reflect the persisted main-process setting on mount.
+  useEffect(() => {
+    // Guards against setting state after unmount if the IPC reply arrives late.
+    let alive = true
+    window.worker.getSpeakReplies().then((r) => { if (alive) setEnabled(!!r.enabled) }).catch(() => {})
+    return () => { alive = false }
+  }, [])
+  // Speak replies as they arrive. Main only forwards when the setting is on, so
+  // this is a no-op while disabled.
+  useEffect(() => {
+    const off = window.worker.onAgentMessage((text) => speech.enqueue(text))
+    return off
+  }, [])
+  // Surface neural-model download/transcribe progress on the button.
+  useEffect(() => {
+    speech.onModelProgress = (p: TtsModelProgress | null) => {
+      if (p && p.status === 'progress' && typeof p.progress === 'number') {
+        setModelPct(Math.min(100, Math.round(p.progress)))
+      } else if (!p) {
+        // A null report clears the badge; other statuses keep the last value.
+        setModelPct(null)
+      }
+    }
+    speech.onActivityChange = (a) => setSpeaking(a)
+    speech.onError = (msg) => setError(msg)
+    return () => {
+      speech.onModelProgress = null
+      speech.onActivityChange = null
+      speech.onError = null
+    }
+  }, [])
+  // Auto-dismiss the error chip.
+  useEffect(() => {
+    if (!error) return
+    const t = setTimeout(() => setError(null), 4000)
+    return () => clearTimeout(t)
+  }, [error])
+  // Web Speech voices populate asynchronously.
+  useEffect(() => {
+    const load = () => setVoices(listWebVoices())
+    load()
+    // Without the Web Speech API there is no event to subscribe to.
+    if (typeof speechSynthesis === 'undefined') return
+    speechSynthesis.addEventListener('voiceschanged', load)
+    return () => speechSynthesis.removeEventListener('voiceschanged', load)
+  }, [])
+  // Close the menu on an outside click.
+  useEffect(() => {
+    if (!showMenu) return
+    const onDown = (e: MouseEvent) => {
+      if (!wrapRef.current?.contains(e.target as Node)) setShowMenu(false)
+    }
+    document.addEventListener('mousedown', onDown)
+    return () => document.removeEventListener('mousedown', onDown)
+  }, [showMenu])
+  // Flip the setting: update the button right away, then persist it via main.
+  const toggle = useCallback(async () => {
+    const next = !enabled
+    setEnabled(next)
+    // Prime the audio context from this click so neural playback isn't blocked
+    // by autoplay policy when a reply later arrives without a gesture.
+    if (next) speech.unlock()
+    else speech.stop()
+    try {
+      await window.worker.setSpeakReplies(next)
+    } catch {
+      // The request failed, so the main-process setting was presumably not
+      // changed. Undo the optimistic flip so the button does not claim a state
+      // that main is not honoring, and tell the user via the error chip.
+      setEnabled(!next)
+      setError('Could not update the spoken replies setting')
+    }
+  }, [enabled])
+  // Persist the engine choice and cut off anything the previous engine is saying.
+  const chooseEngine = useCallback((id: TtsEngineId) => {
+    persistEngine(id)
+    setEngineState(id)
+    speech.stop()
+    // Preload the neural model so the first reply isn't slowed by download.
+    if (id === 'neural') speech.warmup()
+  }, [])
+  // Persist the voice; the empty <option> value maps to null ("System default").
+  const chooseVoice = useCallback((uri: string) => {
+    const v = uri || null
+    persistVoiceURI(v)
+    setVoiceURIState(v)
+  }, [])
+  // Persist the playback rate picked on the slider.
+  const changeRate = useCallback((r: number) => {
+    persistRate(r)
+    setRateState(r)
+  }, [])
+  // Speak the sample phrase with the current settings, replacing any speech in progress.
+  const testVoice = useCallback(() => {
+    speech.unlock()
+    speech.stop()
+    speech.enqueue(SAMPLE)
+  }, [])
+  const title = enabled ? 'Spoken replies on — click to mute' : 'Speak agent replies (off)'
+  return (
+    <div className="speak-controls" ref={wrapRef}>
+      <button
+        type="button"
+        className={`titlebar-btn titlebar-icon-btn speak-btn ${enabled ? 'active' : ''}`}
+        onClick={toggle}
+        title={title}
+        aria-label="Toggle spoken agent replies"
+        aria-pressed={enabled}
+      >
+        <svg viewBox="0 0 24 24" width="13" height="13" fill="none" stroke="currentColor"
+          strokeWidth="2" strokeLinecap="round" strokeLinejoin="round" aria-hidden="true">
+          <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
+          {enabled ? (
+            <>
+              <path d="M15.54 8.46a5 5 0 0 1 0 7.07" />
+              <path d="M19.07 4.93a10 10 0 0 1 0 14.14" />
+            </>
+          ) : (
+            <line x1="23" y1="9" x2="17" y2="15" />
+          )}
+        </svg>
+        {modelPct !== null && <span className="speak-pct">{modelPct}%</span>}
+      </button>
+      {speaking && (
+        <button
+          type="button"
+          className="titlebar-btn titlebar-icon-btn speak-stop"
+          onClick={() => speech.stop()}
+          title="Stop speaking"
+          aria-label="Stop speaking"
+        >
+          <svg viewBox="0 0 24 24" width="11" height="11" fill="currentColor" aria-hidden="true">
+            <rect x="5" y="5" width="14" height="14" rx="2" />
+          </svg>
+        </button>
+      )}
+      <button
+        type="button"
+        className="titlebar-btn speak-caret"
+        onClick={() => setShowMenu((v) => !v)}
+        title="Voice options"
+        aria-label="Voice options"
+        aria-expanded={showMenu}
+      >
+        ▾
+      </button>
+      {showMenu && (
+        <div className="speak-menu" role="menu">
+          <div className="speak-menu-head">Engine</div>
+          <button
+            type="button"
+            role="menuitemradio"
+            aria-checked={engine === 'web'}
+            className={`speak-menu-item ${engine === 'web' ? 'active' : ''}`}
+            onClick={() => chooseEngine('web')}
+          >
+            <span className="speak-menu-check">{engine === 'web' ? '✓' : ''}</span>
+            <span>System voice (instant)</span>
+          </button>
+          <button
+            type="button"
+            role="menuitemradio"
+            aria-checked={engine === 'neural'}
+            className={`speak-menu-item ${engine === 'neural' ? 'active' : ''}`}
+            onClick={() => chooseEngine('neural')}
+          >
+            <span className="speak-menu-check">{engine === 'neural' ? '✓' : ''}</span>
+            <span>Neural voice (downloads)</span>
+          </button>
+          {engine === 'web' && (
+            <>
+              <div className="speak-menu-head">Voice</div>
+              <select
+                className="speak-select"
+                value={voiceURI || ''}
+                onChange={(e) => chooseVoice(e.target.value)}
+              >
+                <option value="">System default</option>
+                {voices.map((v) => (
+                  <option key={v.voiceURI} value={v.voiceURI}>
+                    {v.name} {v.lang ? `(${v.lang})` : ''}
+                  </option>
+                ))}
+              </select>
+            </>
+          )}
+          <div className="speak-menu-head">Rate · {rate.toFixed(1)}×</div>
+          <input
+            className="speak-rate"
+            type="range"
+            min={0.5}
+            max={2}
+            step={0.1}
+            value={rate}
+            onChange={(e) => changeRate(Number(e.target.value))}
+          />
+          <div className="speak-menu-row">
+            <button type="button" className="speak-menu-btn" onClick={testVoice}>Test</button>
+            <button type="button" className="speak-menu-btn" onClick={() => speech.stop()}>Stop</button>
+          </div>
+        </div>
+      )}
+      {error && <div className="speak-error">{error}</div>}
+    </div>
+  )
+}

package/src/renderer/components/VoiceInput.tsx CHANGED Viewed

@@ -43,6 +43,12 @@ type Engine = 'web-speech' | 'local'
 type Phase = 'idle' | 'listening' | 'transcribing'
 const ENGINE_KEY = 'ctlsurf.voiceEngine'
+// localStorage key for the audioinput deviceId used by the local engine.
+// A missing or empty entry means "use the OS default".
+const DEVICE_KEY = 'ctlsurf.voiceDeviceId'
+// Reads the saved deviceId. Yields null when nothing usable is stored or when
+// localStorage cannot be read at all.
+function loadDeviceId(): string | null {
+  try {
+    const stored = localStorage.getItem(DEVICE_KEY)
+    return stored ? stored : null
+  } catch {
+    return null
+  }
+}
 const WEB_SPEECH_SUPPORTED = getRecognitionCtor() !== null
 const LOCAL_SUPPORTED =
@@ -89,6 +95,14 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
   const [error, setError] = useState<string | null>(null)
   const [notice, setNotice] = useState<string | null>(null)
+  // Mic source selection (local engine only)
+  const [devices, setDevices] = useState<MediaDeviceInfo[]>([])
+  const [selectedDeviceId, setSelectedDeviceId] = useState<string | null>(loadDeviceId)
+  const [showDevicePicker, setShowDevicePicker] = useState(false)
+  const selectedDeviceIdRef = useRef(selectedDeviceId)
+  useEffect(() => { selectedDeviceIdRef.current = selectedDeviceId }, [selectedDeviceId])
+  const wrapRef = useRef<HTMLDivElement>(null)
   // Web Speech refs
   const recognitionRef = useRef<SpeechRecognitionLike | null>(null)
   const finalRef = useRef('')
@@ -128,6 +142,88 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
     streamRef.current = null
   }, [])
+  // ─── Mic source selection ──────────────────────────
+  // Re-read the attached devices and keep only the audio inputs in state.
+  const refreshDevices = useCallback(async () => {
+    const mediaDevices = navigator.mediaDevices
+    if (!mediaDevices?.enumerateDevices) return
+    try {
+      const found = await mediaDevices.enumerateDevices()
+      const microphones = found.filter((info) => info.kind === 'audioinput')
+      setDevices(microphones)
+    } catch {
+      /* enumeration is best-effort; the current list stays as it is */
+    }
+  }, [])
+  // Microphone names stay blank until mic permission has been granted. When no
+  // input has a label yet, open (and immediately release) a throwaway stream to
+  // trigger the permission prompt, then refresh the list so names can appear.
+  const ensureDeviceLabels = useCallback(async () => {
+    const mediaDevices = navigator.mediaDevices
+    if (!mediaDevices?.enumerateDevices) return
+    try {
+      const microphones = (await mediaDevices.enumerateDevices())
+        .filter((info) => info.kind === 'audioinput')
+      const allUnlabeled = microphones.length > 0 && !microphones.some((info) => info.label)
+      if (allUnlabeled) {
+        const probe = await mediaDevices.getUserMedia({ audio: true })
+        for (const track of probe.getTracks()) track.stop()
+      }
+    } catch {
+      /* permission denied → we'll show generic names */
+    }
+    await refreshDevices()
+  }, [refreshDevices])
+  // Flip the mic-source picker between open and closed.
+  const toggleDevicePicker = useCallback(() => {
+    setShowDevicePicker((isOpen) => {
+      return !isOpen
+    })
+  }, [])
+  // Populate device labels whenever the picker opens (needs mic permission).
+  useEffect(() => {
+    if (showDevicePicker) void ensureDeviceLabels()
+  }, [showDevicePicker, ensureDeviceLabels])
+  // Select a microphone (null = system default), persist the choice, and close
+  // the picker. Storage failures are ignored; the in-memory selection still applies.
+  const chooseDevice = useCallback((id: string | null) => {
+    setSelectedDeviceId(id)
+    try {
+      if (!id) {
+        localStorage.removeItem(DEVICE_KEY)
+      } else {
+        localStorage.setItem(DEVICE_KEY, id)
+      }
+    } catch {
+      /* ignore */
+    }
+    setShowDevicePicker(false)
+  }, [])
+  // Open a microphone stream for the saved device. If that exact device can no
+  // longer be satisfied (e.g. it was unplugged since it was chosen), forget it,
+  // show a notice, and retry with the OS default. Any other failure is rethrown.
+  const getStream = useCallback(async (): Promise<MediaStream> => {
+    const savedId = selectedDeviceIdRef.current
+    const audio = savedId ? { deviceId: { exact: savedId } } : true
+    try {
+      return await navigator.mediaDevices.getUserMedia({ audio })
+    } catch (err) {
+      const savedDeviceGone =
+        Boolean(savedId) && (err as { name?: string })?.name === 'OverconstrainedError'
+      if (!savedDeviceGone) throw err
+      try {
+        localStorage.removeItem(DEVICE_KEY)
+      } catch {
+        /* ignore */
+      }
+      setSelectedDeviceId(null)
+      setNotice('Saved microphone unavailable — using system default.')
+      return navigator.mediaDevices.getUserMedia({ audio: true })
+    }
+  }, [])
+  // Keep the list fresh when devices are plugged/unplugged.
+  useEffect(() => {
+    const md = navigator.mediaDevices
+    if (!md?.addEventListener) return
+    const onChange = () => { void refreshDevices() }
+    md.addEventListener('devicechange', onChange)
+    void refreshDevices()
+    return () => md.removeEventListener('devicechange', onChange)
+  }, [refreshDevices])
+  // Close the picker on an outside click.
+  useEffect(() => {
+    if (!showDevicePicker) return
+    const onDocDown = (e: MouseEvent) => {
+      if (!wrapRef.current?.contains(e.target as Node)) setShowDevicePicker(false)
+    }
+    document.addEventListener('mousedown', onDocDown)
+    return () => document.removeEventListener('mousedown', onDocDown)
+  }, [showDevicePicker])
   // ─── Web Speech engine ─────────────────────────────
   const startWebSpeech = useCallback(() => {
@@ -207,6 +303,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
     try {
       const text = await transcribeBlob(blob, handleModelProgress)
       if (text) onTranscriptRef.current(text)
+      else setNotice('No speech detected — check the mic source (▾).')
     } catch (err) {
       setError('On-device transcription failed')
       console.error('[voice] local transcription failed', err)
@@ -220,7 +317,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
     setError(null); setNotice(null); setInterim('')
     cancelGestureRef.current = false
     try {
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+      const stream = await getStream()
       // Released during the permission/await — don't record anything.
       if (cancelGestureRef.current) {
         stream.getTracks().forEach((t) => t.stop())
@@ -241,7 +338,7 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
       setError(describeMicError(err))
       console.error('[voice] getUserMedia failed', err)
     }
-  }, [runLocalTranscription, stopStream])
+  }, [runLocalTranscription, stopStream, getStream])
   const stopLocal = useCallback(() => {
     cancelGestureRef.current = true
@@ -299,8 +396,23 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
     ? `voice-btn voice-btn-floating ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
     : `titlebar-btn titlebar-icon-btn voice-btn ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`
+  // Hide Chromium's synthetic "default"/"communications" aggregate entries from
+  // the list; we expose our own "System default" item instead.
+  const realDevices = devices.filter(
+    (d) => d.deviceId && d.deviceId !== 'default' && d.deviceId !== 'communications',
+  )
+  const rawDefault = devices.find((d) => d.deviceId === 'default')?.label
+  const defaultLabel = rawDefault
+    ? `System default · ${rawDefault.replace(/^Default\s*-\s*/i, '')}`
+    : 'System default'
+  const activeLabel = selectedDeviceId
+    ? (realDevices.find((d) => d.deviceId === selectedDeviceId)?.label || 'Selected microphone')
+    : defaultLabel
+  // Source selection only affects the local (getUserMedia) engine.
+  const showSourcePicker = floating && LOCAL_SUPPORTED
   return (
-    <div className="voice-input-wrap">
+    <div className="voice-input-wrap" ref={wrapRef}>
       <button
         type="button"
         className={btnClass}
@@ -315,6 +427,50 @@ export function VoiceInput({ onTranscript, variant = 'titlebar' }: VoiceInputPro
         <span className="voice-icon" aria-hidden="true">🎤</span>
         <span className={`voice-dot ${listening ? 'on' : busy ? 'busy' : 'off'}`} />
       </button>
+      {showSourcePicker && (
+        <button
+          type="button"
+          className="voice-source-btn"
+          onPointerDown={(e) => e.stopPropagation()}
+          onClick={toggleDevicePicker}
+          title={`Mic source: ${activeLabel}`}
+          aria-label="Choose microphone source"
+          aria-expanded={showDevicePicker}
+        >
+          ▾
+        </button>
+      )}
+      {showSourcePicker && showDevicePicker && (
+        <div className="voice-source-menu" role="menu">
+          <div className="voice-source-head">Microphone source</div>
+          <button
+            type="button"
+            role="menuitemradio"
+            aria-checked={selectedDeviceId === null}
+            className={`voice-source-item ${selectedDeviceId === null ? 'active' : ''}`}
+            onClick={() => chooseDevice(null)}
+          >
+            <span className="voice-source-check">{selectedDeviceId === null ? '✓' : ''}</span>
+            <span className="voice-source-label">{defaultLabel}</span>
+          </button>
+          {realDevices.map((d, i) => (
+            <button
+              key={d.deviceId}
+              type="button"
+              role="menuitemradio"
+              aria-checked={selectedDeviceId === d.deviceId}
+              className={`voice-source-item ${selectedDeviceId === d.deviceId ? 'active' : ''}`}
+              onClick={() => chooseDevice(d.deviceId)}
+            >
+              <span className="voice-source-check">{selectedDeviceId === d.deviceId ? '✓' : ''}</span>
+              <span className="voice-source-label">{d.label || `Microphone ${i + 1}`}</span>
+            </button>
+          ))}
+          {realDevices.length === 0 && (
+            <div className="voice-source-empty">No microphones found</div>
+          )}
+        </div>
+      )}
       {chip && <div className={`voice-chip ${chip.kind} ${floating ? 'voice-chip-floating' : ''}`}>{chip.text}</div>}
     </div>
   )

package/src/renderer/lib/localWhisper.ts CHANGED Viewed

@@ -6,6 +6,37 @@
 const MODEL = 'Xenova/whisper-base'
 const TARGET_SAMPLE_RATE = 16000
+// Whisper hallucinates filler tokens ("you", "Thank you.", "Thanks for
+// watching.") when fed silence. Two guards below tame that:
+//  - Below SILENCE_RMS we treat the clip as silent and skip the model entirely.
+//  - Between SILENCE_RMS and LOW_CONFIDENCE_RMS we still run the model but drop
+//    the result if it's *only* a known filler phrase (likely a hallucination on
+//    a quiet clip rather than real speech).
+// Both thresholds are compared against the value returned by computeRms().
+const SILENCE_RMS = 0.008
+const LOW_CONFIDENCE_RMS = 0.02
+// Entries are lowercase and unpunctuated because isFillerOnly() normalizes the
+// transcript that way before looking it up here.
+const FILLER_PHRASES = new Set([
+  'you',
+  'thank you',
+  'thank you very much',
+  'thank you for watching',
+  'thanks for watching',
+  'please subscribe',
+])
+// Loudness proxy: the root-mean-square amplitude of the samples. Normalized
+// speech sits around 0.05–0.15; a silent room is well under 0.005. An empty
+// clip reports 0 rather than dividing by zero.
+function computeRms(pcm: Float32Array): number {
+  const sampleCount = pcm.length
+  if (sampleCount === 0) return 0
+  let sumOfSquares = 0
+  for (const sample of pcm) {
+    sumOfSquares += sample * sample
+  }
+  return Math.sqrt(sumOfSquares / sampleCount)
+}
+// True when the text, once lowercased, stripped of basic punctuation and
+// quotes, and whitespace-collapsed, is exactly one of FILLER_PHRASES.
+function isFillerOnly(text: string): boolean {
+  const lowered = text.toLowerCase()
+  const withoutPunctuation = lowered.replace(/[.!?,…"']/g, '')
+  const normalized = withoutPunctuation.replace(/\s+/g, ' ').trim()
+  return FILLER_PHRASES.has(normalized)
+}
 export interface ModelProgress {
   status: string
   file?: string
@@ -77,12 +108,25 @@ async function blobToPcm16k(blob: Blob): Promise<Float32Array | null> {
 }
+/**
+ * Transcribes a recorded audio blob and returns the trimmed text.
+ *
+ * Returns '' when blobToPcm16k yields no PCM data, when the clip's RMS is below
+ * SILENCE_RMS (the transcriber is not loaded at all in that case), or when a
+ * clip quieter than LOW_CONFIDENCE_RMS produces only a known filler phrase.
+ *
+ * @param blob - Recorded audio to transcribe.
+ * @param onProgress - Optional callback passed through to loadTranscriber.
+ * @returns The transcript, or '' in the cases described above.
+ */
 export async function transcribeBlob(blob: Blob, onProgress?: (p: ModelProgress) => void): Promise<string> {
-  const transcriber = await loadTranscriber(onProgress)
+  // Decode first so a silent clip never triggers the (one-time, ~75MB) model
+  // download just to be thrown away below.
   const pcm = await blobToPcm16k(blob)
   if (!pcm) return ''
+  const rms = computeRms(pcm)
+  if (rms < SILENCE_RMS) {
+    console.info(`[voice] near-silent clip (rms=${rms.toFixed(4)}); skipping transcription`)
+    return ''
+  }
+  const transcriber = await loadTranscriber(onProgress)
   const result = await transcriber(pcm)
+  // The transcriber may return one result or an array of them; join the text
+  // either way and trim it once.
-  const text = Array.isArray(result)
+  const text = (Array.isArray(result)
     ? result.map((r) => r.text).join(' ')
-    : result?.text
-  return (text || '').trim()
+    : result?.text || '').trim()
+  // Only quiet clips are filtered: a louder clip that says just "thank you" is kept.
+  if (text && rms < LOW_CONFIDENCE_RMS && isFillerOnly(text)) {
+    console.info(`[voice] dropping filler-only output "${text}" from quiet clip (rms=${rms.toFixed(4)})`)
+    return ''
+  }
+  return text
 }