npm - @phenx-inc/ctlsurf - Versions diffs - 0.5.2 → 0.7.0 - Mend

@phenx-inc/ctlsurf 0.5.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/src/renderer/lib/localWhisper.ts ADDED Viewed

@@ -0,0 +1,88 @@
+// Local, offline speech-to-text via transformers.js (Whisper). Used as the
+// fallback when the browser's Web Speech API is unavailable (the common case
+// inside packaged Electron). The library and the ~75MB model are fetched lazily
+// the first time a local transcription is requested, then cached by the runtime.
+// Hugging Face hub id of the Whisper checkpoint that is handed to pipeline().
+const MODEL = 'Xenova/whisper-base'
+// Sample rate, in Hz, that recorded audio is resampled to before transcription.
+const TARGET_SAMPLE_RATE = 16000
+// Shape of the progress events delivered to an onProgress callback. The
+// callback is passed to transformers.js as `progress_callback`, so the actual
+// field values are produced by that library, not by this module.
+export interface ModelProgress {
+  status: string // NOTE(review): presumably the library's phase name — confirm against transformers.js
+  file?: string // presumably the file this event refers to — confirm
+  progress?: number // 0–100 for the file currently downloading
+  loaded?: number // presumably bytes received so far — confirm
+  total?: number // presumably total bytes expected — confirm
+}
+// transformers.js is large and node-aware, so import it dynamically (Vite
+// code-splits it into its own chunk that only loads on first local use).
+// Call signature this module assumes for the object returned by pipeline():
+// a Float32Array of audio samples in, one result or an array of results out.
+type Transcriber = (audio: Float32Array, options?: Record<string, unknown>) => Promise<{ text: string } | Array<{ text: string }>>
+// Memoized load. Null until loadTranscriber() is first called, and reset to
+// null when that load rejects so a later call can try again.
+let transcriberPromise: Promise<Transcriber> | null = null
+// Reports whether a transcriber load has been started and has not failed.
+// In this file the cached promise is only cleared when the load rejects, so
+// this keeps returning true after the model has finished loading as well.
+export function isLocalModelLoading(): boolean {
+  const loadStarted = transcriberPromise !== null
+  return loadStarted
+}
+// Callbacks that want progress events for the load currently in flight. They
+// live outside the cached promise so that callers arriving after the first
+// one still receive events instead of being silently ignored.
+const progressListeners = new Set<(p: ModelProgress) => void>()
+/**
+ * Lazily creates (once) and returns the shared Whisper transcriber.
+ *
+ * The first call dynamically imports transformers.js and builds the pipeline;
+ * concurrent and later calls share the same promise. If the load rejects, the
+ * cache is cleared so a subsequent call starts a fresh attempt.
+ *
+ * @param onProgress Optional listener for download/initialisation events. It
+ *   is notified for as long as this call is waiting on the load, whether or
+ *   not this call was the one that started it.
+ * @returns The ready-to-call transcriber.
+ * @throws Whatever the dynamic import or pipeline() rejects with.
+ */
+export async function loadTranscriber(onProgress?: (p: ModelProgress) => void): Promise<Transcriber> {
+  if (onProgress) progressListeners.add(onProgress)
+  try {
+    let active = transcriberPromise
+    if (!active) {
+      const created: Promise<Transcriber> = (async () => {
+        const { pipeline, env } = await import('@huggingface/transformers')
+        // We don't ship model files; always fetch from the Hugging Face hub.
+        env.allowLocalModels = false
+        // Fan each event out to every caller currently waiting on this load.
+        const notifyAll = (p: ModelProgress): void => {
+          for (const listener of progressListeners) listener(p)
+        }
+        const common = { progress_callback: notifyAll as never }
+        // Prefer WebGPU for speed when the runtime exposes it; otherwise use the
+        // default (WASM) backend, which always works. Guarding on navigator.gpu
+        // avoids a wasted partial download when there's no GPU path at all.
+        const hasWebGpu = typeof navigator !== 'undefined' && 'gpu' in navigator
+        if (hasWebGpu) {
+          try {
+            return (await pipeline('automatic-speech-recognition', MODEL, { ...common, device: 'webgpu' })) as unknown as Transcriber
+          } catch (err) {
+            console.warn('[voice] WebGPU backend failed, falling back to WASM', err)
+          }
+        }
+        return (await pipeline('automatic-speech-recognition', MODEL, common)) as unknown as Transcriber
+      })()
+      // Allow a later retry if the first load fails (e.g. offline on first use).
+      // Only clear the slot if it still holds this attempt, so a newer load is
+      // never discarded by an older failure.
+      created.catch(() => {
+        if (transcriberPromise === created) transcriberPromise = null
+      })
+      transcriberPromise = created
+      active = created
+    }
+    return await active
+  } finally {
+    // Stop notifying (and stop retaining) this caller once its wait is over.
+    if (onProgress) progressListeners.delete(onProgress)
+  }
+}
+// Turns a recorded audio Blob into mono Float32 PCM at TARGET_SAMPLE_RATE, the
+// input format Whisper expects. Resolves to null when the blob is empty, cannot
+// be decoded, or decodes to less than one output frame.
+async function blobToPcm16k(blob: Blob): Promise<Float32Array | null> {
+  if (blob.size === 0) {
+    return null
+  }
+  const encoded = await blob.arrayBuffer()
+  // Fall back to the prefixed constructor where the standard one is missing.
+  const ContextCtor = window.AudioContext || (window as unknown as { webkitAudioContext: typeof AudioContext }).webkitAudioContext
+  const decodeCtx = new ContextCtor()
+  let audio: AudioBuffer | null = null
+  try {
+    audio = await decodeCtx.decodeAudioData(encoded)
+  } catch {
+    audio = null
+  } finally {
+    // The context is only needed for decoding; release it either way.
+    decodeCtx.close()
+  }
+  if (audio === null) {
+    return null
+  }
+  const frameCount = Math.ceil(audio.duration * TARGET_SAMPLE_RATE)
+  if (frameCount < 1) {
+    return null
+  }
+  // Rendering through a one-channel offline context at the target rate does
+  // the resampling and produces a single output channel.
+  const resampler = new OfflineAudioContext(1, frameCount, TARGET_SAMPLE_RATE)
+  const player = resampler.createBufferSource()
+  player.buffer = audio
+  player.connect(resampler.destination)
+  player.start()
+  return (await resampler.startRendering()).getChannelData(0)
+}
+/**
+ * Transcribes a recorded audio clip with the local Whisper model.
+ *
+ * @param blob Recorded audio in any container the browser can decode.
+ * @param onProgress Optional listener forwarded to loadTranscriber() for model
+ *   download/initialisation events.
+ * @returns The trimmed transcript, or '' when the clip is empty or undecodable
+ *   or the model returns no text.
+ * @throws Whatever loadTranscriber() or the transcriber itself rejects with.
+ */
+export async function transcribeBlob(blob: Blob, onProgress?: (p: ModelProgress) => void): Promise<string> {
+  const transcriber = await loadTranscriber(onProgress)
+  const pcm = await blobToPcm16k(blob)
+  if (!pcm) return ''
+  // transformers.js defaults chunk_length_s to 0 (no chunking), in which case
+  // Whisper only processes a single 30-second window and anything after that
+  // is dropped. Chunk with overlap so longer recordings are fully transcribed.
+  const result = await transcriber(pcm, { chunk_length_s: 30, stride_length_s: 5 })
+  // The pipeline may return one result or an array of them; treat both alike.
+  const segments = Array.isArray(result) ? result : [result]
+  return segments
+    .map((segment) => segment?.text ?? '')
+    .join(' ')
+    .trim()
+}

package/src/renderer/styles.css CHANGED Viewed

@@ -570,6 +570,169 @@ html, body, #root {
   line-height: 1;
 }
+/* Voice typing (push-to-talk mic) */
+/* Inline wrapper; position: relative makes it the containing block for
+   absolutely positioned descendants (presumably .voice-chip — confirm in markup). */
+.voice-input-wrap {
+  position: relative;
+  display: inline-flex;
+}
+/* Disable text selection and browser touch gestures on the mic button. */
+.voice-btn {
+  user-select: none;
+  -webkit-user-select: none;
+  touch-action: none;
+}
+/* Dimmed, non-interactive look for the disabled button. */
+.voice-btn:disabled {
+  opacity: 0.4;
+  cursor: not-allowed;
+}
+/* Red accent for the .listening state. */
+.voice-btn.listening {
+  color: #f7768e;
+  border-color: #f7768e;
+  background: #1f2335;
+}
+/* Amber accent for the .busy state. */
+.voice-btn.busy {
+  color: #e0af68;
+  border-color: #e0af68;
+  background: #1f2335;
+}
+/* Glyph size inside the default (non-floating) button. */
+.voice-icon {
+  font-size: 13px;
+  line-height: 1;
+}
+/* 6px round status dot; muted colour unless .on or .busy is applied. */
+.voice-dot {
+  width: 6px;
+  height: 6px;
+  border-radius: 50%;
+  display: inline-block;
+  vertical-align: middle;
+  background: #565f89;
+}
+/* Red, glowing dot pulsing once per second. */
+.voice-dot.on {
+  background: #f7768e;
+  box-shadow: 0 0 4px #f7768e;
+  animation: voice-pulse 1s ease-in-out infinite;
+}
+/* Amber, glowing dot pulsing slightly faster (0.8s). */
+.voice-dot.busy {
+  background: #e0af68;
+  box-shadow: 0 0 4px #e0af68;
+  animation: voice-pulse 0.8s ease-in-out infinite;
+}
+/* Fade from fully opaque to 30% opacity and back. */
+@keyframes voice-pulse {
+  0%, 100% { opacity: 1; }
+  50% { opacity: 0.3; }
+}
+/* Status chip: placed just below its containing block, right-aligned, on a
+   single line truncated with an ellipsis. pointer-events: none keeps it from
+   intercepting clicks. */
+.voice-chip {
+  position: absolute;
+  top: 100%;
+  right: 0;
+  margin-top: 6px;
+  max-width: 320px;
+  padding: 4px 9px;
+  border-radius: 5px;
+  font-size: 11px;
+  line-height: 1.3;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  z-index: 50;
+  pointer-events: none;
+  border: 1px solid #3b3d57;
+}
+/* Neutral text colour for the .listening variant. */
+.voice-chip.listening {
+  background: #1f2335;
+  color: #a9b1d6;
+}
+/* Amber text and border for the .busy variant. */
+.voice-chip.busy {
+  background: #1f2335;
+  color: #e0af68;
+  border-color: #e0af68;
+}
+/* Same amber palette as .busy, but the text is allowed to wrap. */
+.voice-chip.notice {
+  background: #1f2335;
+  color: #e0af68;
+  border-color: #e0af68;
+  white-space: normal;
+}
+/* Red palette for errors; the text is allowed to wrap. */
+.voice-chip.error {
+  background: #2d2030;
+  color: #f7768e;
+  border-color: #f7768e;
+  white-space: normal;
+}
+/* Floating push-to-talk mic (draggable, dismissable FAB) */
+/* Fixed-position container stacked vertically; no top/left is set here, so
+   the position is presumably applied from script — confirm in the component. */
+.floating-mic {
+  position: fixed;
+  z-index: 200;
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 4px;
+  padding: 4px 4px 6px;
+  background: #16161e;
+  border: 1px solid #3b3d57;
+  border-radius: 12px;
+  box-shadow: 0 4px 16px rgba(0, 0, 0, 0.45);
+  user-select: none;
+  -webkit-user-select: none;
+}
+/* Full-width top row with a grab cursor; touch-action: none stops the browser
+   from handling touch gestures that start on it. */
+.floating-mic-handle {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  width: 100%;
+  cursor: grab;
+  touch-action: none;
+}
+/* Switch to the grabbing cursor while the handle is pressed. */
+.floating-mic-handle:active { cursor: grabbing; }
+/* Muted grip glyph inside the handle. */
+.floating-mic-grip {
+  color: #565f89;
+  font-size: 12px;
+  line-height: 1;
+  padding: 0 2px;
+}
+/* Borderless hide button; turns red on hover. */
+.floating-mic-hide {
+  background: transparent;
+  border: none;
+  color: #565f89;
+  font-size: 15px;
+  line-height: 1;
+  cursor: pointer;
+  padding: 0 2px;
+}
+.floating-mic-hide:hover { color: #f7768e; }
+/* Round 48px mic button; position: relative anchors the status dot below. */
+.voice-btn-floating {
+  width: 48px;
+  height: 48px;
+  border-radius: 50%;
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  background: #2a2b3d;
+  border: 1px solid #3b3d57;
+  color: #a9b1d6;
+  cursor: pointer;
+  position: relative;
+  transition: all 0.15s;
+}
+.voice-btn-floating:hover { border-color: #565f89; }
+/* Larger glyph than the 13px used by .voice-icon on its own. */
+.voice-btn-floating .voice-icon { font-size: 22px; }
+/* Pin the status dot to the button's top-right corner. */
+.voice-btn-floating .voice-dot {
+  position: absolute;
+  top: 5px;
+  right: 5px;
+}
+/* Floating chip sits above the round button rather than below it. */
+/* Overrides .voice-chip's top/right/margin-top and centres it horizontally. */
+.voice-chip-floating {
+  top: auto;
+  bottom: 100%;
+  right: auto;
+  left: 50%;
+  transform: translateX(-50%);
+  margin-top: 0;
+  margin-bottom: 8px;
+}
 /* Editor panel */
 .editor-panel {
   display: flex;