npm - @phenx-inc/ctlsurf - Versions diffs - 0.6.0 → 0.8.0 - Mend

@phenx-inc/ctlsurf 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/src/renderer/lib/speech.ts ADDED Viewed

@@ -0,0 +1,299 @@
+// Text-to-speech for spoken agent replies (Electron desktop only). Two engines
+// the user can pick between:
+//   - 'web'    → the built-in Web Speech synthesis API (OS voices, no download)
+//   - 'neural' → a local neural model via transformers.js (heavier, downloads
+//                on first use, more natural)
+// A small queue serializes utterances so overlapping replies don't talk over
+// each other, and stop() flushes everything.
+export type TtsEngineId = 'web' | 'neural' // identifiers of the two selectable engines
+const ENGINE_KEY = 'ctlsurf.tts.engine' // localStorage key for the chosen engine
+const VOICE_KEY = 'ctlsurf.tts.voiceURI' // localStorage key for the preferred Web Speech voice URI
+const RATE_KEY = 'ctlsurf.tts.rate' // localStorage key for the speaking rate
+// Cap so a runaway reply doesn't narrate for minutes; split into sentence-ish
+// chunks so long passages stay reliable on the Web Speech backend.
+const MAX_SPEAK_CHARS = 1600 // upper bound on cleaned text length (applied in cleanForSpeech)
+const MAX_CHUNK_CHARS = 280 // upper bound on one queued chunk (applied in splitIntoChunks)
+// ─── Text cleaning ───────────────────────────────────
+// Turn a markdown-ish assistant reply into something listenable: code blocks
+// are condensed to a short spoken marker rather than read symbol-by-symbol.
+/**
+ * Turn a markdown-ish assistant reply into plain text that is pleasant to hear.
+ *
+ * Fenced code collapses to a short spoken marker, links and bare URLs are
+ * reduced to their text or the word "link", deep file paths are reduced to
+ * their basename, HTML tags and markdown markers are removed, whitespace is
+ * collapsed, and the result is capped at MAX_SPEAK_CHARS (trimmed back to the
+ * last whitespace when there is one, then suffixed with an ellipsis).
+ *
+ * @param input Raw reply text, possibly containing markdown or HTML.
+ * @returns Cleaned single-line text; an empty string when nothing is left.
+ */
+export function cleanForSpeech(input: string): string {
+  let t = input
+  // Fenced code blocks → "code block, N lines." (blank lines are not counted)
+  t = t.replace(/```[^\n]*\n?([\s\S]*?)```/g, (_m, body: string) => {
+    const lines = body.replace(/\n+$/, '').split('\n').filter((l) => l.trim().length).length
+    return lines > 0 ? ` (code block, ${lines} ${lines === 1 ? 'line' : 'lines'}) ` : ' (code) '
+  })
+  // Leftover/unterminated fence
+  t = t.replace(/```/g, ' (code) ')
+  // Inline code → spoken contents without backticks
+  t = t.replace(/`([^`]+)`/g, '$1')
+  // Markdown links [text](url) → text
+  t = t.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
+  // Bare URLs → "link"
+  t = t.replace(/https?:\/\/\S+/g, 'link')
+  // File paths with many slashes are noise when read aloud → basename
+  t = t.replace(/(?:[\w.-]*\/){2,}([\w.-]+)/g, '$1')
+  // HTML tags, if any slipped through. This must run BEFORE the markdown
+  // marker strip below: that step deletes every '>', after which no tag could
+  // ever match. Requiring a letter after '<' keeps "a < b" comparisons intact.
+  t = t.replace(/<\/?[A-Za-z][^<>]*>/g, ' ')
+  // Strip leftover markdown emphasis / heading / list / blockquote markers
+  t = t.replace(/[*_#>]+/g, ' ')
+  t = t.replace(/^\s*[-•]\s+/gm, ', ')
+  // Collapse whitespace
+  t = t.replace(/\s+/g, ' ').trim()
+  if (t.length > MAX_SPEAK_CHARS) {
+    // Drop the trailing partial word so the cut does not land mid-word.
+    t = t.slice(0, MAX_SPEAK_CHARS).replace(/\s+\S*$/, '') + '…'
+  }
+  return t
+}
+/**
+ * Split cleaned text into chunks of at most MAX_CHUNK_CHARS characters.
+ *
+ * Text is first cut into sentence-ish pieces at '.', '!' and '?'; consecutive
+ * pieces are packed into one chunk while they fit. A single piece longer than
+ * the cap is split at the last whitespace inside the window (falling back to a
+ * hard cut only when the window has no whitespace), so words are not cut in
+ * half. Empty pieces are never emitted.
+ *
+ * @param text Text to split (expected to be the output of cleanForSpeech).
+ * @returns Trimmed, non-empty chunks in reading order; [] for empty input.
+ */
+function splitIntoChunks(text: string): string[] {
+  if (text.length <= MAX_CHUNK_CHARS) return text ? [text] : []
+  const sentences = text.match(/[^.!?]+[.!?]+|\S[^.!?]*$/g) || [text]
+  const chunks: string[] = []
+  // Trim and keep only pieces that still contain something to say.
+  const emit = (piece: string): void => {
+    const trimmed = piece.trim()
+    if (trimmed) chunks.push(trimmed)
+  }
+  let buf = ''
+  for (const s of sentences) {
+    if (buf && (buf + s).length > MAX_CHUNK_CHARS) {
+      emit(buf)
+      buf = ''
+    }
+    if (s.length <= MAX_CHUNK_CHARS) {
+      buf += s
+      continue
+    }
+    // A single sentence longer than the cap: peel off cap-sized pieces,
+    // preferring the last whitespace at or before the cap as the cut point.
+    let rest = s
+    while (rest.length > MAX_CHUNK_CHARS) {
+      const lastGap = rest.slice(0, MAX_CHUNK_CHARS + 1).search(/\s\S*$/)
+      const cut = lastGap > 0 ? lastGap : MAX_CHUNK_CHARS
+      emit(rest.slice(0, cut))
+      rest = rest.slice(cut)
+    }
+    emit(rest)
+  }
+  emit(buf)
+  return chunks
+}
+// ─── Preferences ─────────────────────────────────────
+/** Read the persisted engine choice; anything other than 'neural' (or a storage failure) yields 'web'. */
+export function getEngine(): TtsEngineId {
+  let stored: string | null
+  try {
+    stored = localStorage.getItem(ENGINE_KEY)
+  } catch {
+    return 'web'
+  }
+  return stored === 'neural' ? 'neural' : 'web'
+}
+/** Persist the engine choice; storage failures are swallowed on purpose. */
+export function setEngine(id: TtsEngineId): void {
+  try {
+    localStorage.setItem(ENGINE_KEY, id)
+  } catch {
+    /* ignore */
+  }
+}
+/** Read the persisted voice URI; a missing/empty value or a storage failure yields null. */
+export function getVoiceURI(): string | null {
+  try {
+    const stored = localStorage.getItem(VOICE_KEY)
+    return stored ? stored : null
+  } catch {
+    return null
+  }
+}
+/** Persist the voice URI, or remove the stored value when given null/empty; storage failures are swallowed. */
+export function setVoiceURI(uri: string | null): void {
+  try {
+    if (uri) {
+      localStorage.setItem(VOICE_KEY, uri)
+    } else {
+      localStorage.removeItem(VOICE_KEY)
+    }
+  } catch {
+    /* ignore */
+  }
+}
+/** Read the persisted speaking rate; values outside 0.5–2, non-numeric values, or a storage failure yield 1. */
+export function getRate(): number {
+  try {
+    const parsed = Number(localStorage.getItem(RATE_KEY))
+    const usable = Number.isFinite(parsed) && parsed >= 0.5 && parsed <= 2
+    if (!usable) return 1
+    return parsed
+  } catch {
+    return 1
+  }
+}
+/** Persist the speaking rate as a string; storage failures are swallowed on purpose. */
+export function setRate(rate: number): void {
+  try {
+    const serialized = String(rate)
+    localStorage.setItem(RATE_KEY, serialized)
+  } catch {
+    /* ignore */
+  }
+}
+/** List the Web Speech voices currently reported, or [] when speechSynthesis does not exist. */
+export function listWebVoices(): SpeechSynthesisVoice[] {
+  return typeof speechSynthesis === 'undefined' ? [] : speechSynthesis.getVoices()
+}
+// ─── Neural engine (transformers.js) ─────────────────
+// Self-contained neural voice: a VITS model that needs no speaker-embeddings
+// file and no separate vocoder (unlike SpeechT5), so first use has far fewer
+// ways to fail. English; output is mono 16kHz PCM.
+const NEURAL_MODEL = 'Xenova/mms-tts-eng' // model id handed to the transformers.js pipeline
+type RawAudio = { audio: Float32Array; sampling_rate: number } // PCM samples plus their sample rate
+type Synthesizer = (text: string, options?: Record<string, unknown>) => Promise<RawAudio | RawAudio[]> // call shape the pipeline result is cast to
+let synthPromise: Promise<Synthesizer> | null = null // cached load; set by loadSynthesizer, reset to null if the load rejects
+export interface TtsModelProgress { status: string; progress?: number } // progress event forwarded to onProgress callbacks
+/**
+ * Load the neural synthesizer once and share the resulting promise.
+ *
+ * The first call starts the load; later calls return the same promise, so
+ * only the first caller's onProgress is wired to the pipeline. If the load
+ * rejects, the cache is cleared so a later call can retry.
+ *
+ * @param onProgress Optional progress listener passed as progress_callback.
+ * @returns Promise of the synthesizer function.
+ */
+function loadSynthesizer(onProgress?: (p: TtsModelProgress) => void): Promise<Synthesizer> {
+  if (synthPromise) return synthPromise
+  const create = async (): Promise<Synthesizer> => {
+    const { pipeline, env } = await import('@huggingface/transformers')
+    env.allowLocalModels = false
+    const common = { progress_callback: onProgress as never }
+    // Try WebGPU first when the browser exposes it (much faster than the WASM
+    // CPU backend); any failure there drops through to the WASM path below.
+    const gpuAvailable = typeof navigator !== 'undefined' && 'gpu' in navigator
+    if (gpuAvailable) {
+      try {
+        const gpuSynth = (await pipeline('text-to-speech', NEURAL_MODEL, { ...common, device: 'webgpu' })) as unknown as Synthesizer
+        console.info('[tts] neural backend: webgpu')
+        return gpuSynth
+      } catch (err) {
+        console.warn('[tts] WebGPU backend failed, falling back to WASM', err)
+      }
+    }
+    const wasmSynth = (await pipeline('text-to-speech', NEURAL_MODEL, common)) as unknown as Synthesizer
+    console.info('[tts] neural backend: wasm')
+    return wasmSynth
+  }
+  const pending = create()
+  synthPromise = pending
+  // Forget a failed load so the next call starts over.
+  pending.catch(() => { synthPromise = null })
+  return pending
+}
+// Reports whether a neural model load has been started and has not failed.
+// NOTE(review): despite the name, this stays true after a successful load,
+// because synthPromise is only reset to null when the load rejects. Confirm
+// callers expect "started or loaded" rather than "currently loading".
+export function isNeuralModelLoading(): boolean {
+  const loadStarted = synthPromise !== null
+  return loadStarted
+}
+// ─── Controller ──────────────────────────────────────
+/**
+ * Queue-backed speech controller: cleans and chunks replies, speaks the
+ * chunks one at a time with the currently selected engine, and lets stop()
+ * abort everything. A generation counter invalidates in-flight async work:
+ * every async step re-checks it and bails out if stop() has run since.
+ */
+class SpeechController {
+  private queue: string[] = []
+  private draining = false
+  private audioCtx: AudioContext | null = null
+  private currentSource: AudioBufferSourceNode | null = null
+  private generation = 0 // bumped by stop() to abort in-flight work
+  private active = false
+  // Receives neural model progress events; null means "clear the indicator".
+  onModelProgress: ((p: TtsModelProgress | null) => void) | null = null
+  // Fires true while a reply is being spoken/queued, false when idle — drives
+  // the visible Stop button.
+  onActivityChange: ((active: boolean) => void) | null = null
+  // Receives a user-facing message when speaking a chunk fails.
+  onError: ((message: string) => void) | null = null
+  // Prime/resume the AudioContext from a user gesture so neural playback isn't
+  // blocked by the browser autoplay policy (the System engine is unaffected).
+  unlock(): void {
+    try {
+      const ctx = this.ensureCtx()
+      if (ctx.state === 'suspended') void ctx.resume()
+    } catch { /* ignore */ }
+  }
+  // Start loading the neural model in the background (e.g. when the user picks
+  // the neural engine) so the first utterance doesn't pay download/compile time.
+  warmup(): void {
+    if (getEngine() !== 'neural') return
+    void loadSynthesizer((p) => this.onModelProgress?.(p))
+      .then(() => this.onModelProgress?.(null))
+      .catch(() => { this.onModelProgress?.(null) })
+  }
+  // Clean the reply, split it into chunks, append them to the queue and start
+  // draining if no drain is already running. Empty cleaned text is ignored.
+  enqueue(rawText: string): void {
+    const text = cleanForSpeech(rawText)
+    if (!text) return
+    this.queue.push(...splitIntoChunks(text))
+    void this.drain()
+  }
+  // Abort everything: invalidate in-flight work, empty the queue, cancel Web
+  // Speech output, stop the playing audio source and clear progress/activity.
+  stop(): void {
+    this.generation++
+    this.queue = []
+    this.draining = false
+    if (typeof speechSynthesis !== 'undefined') {
+      try { speechSynthesis.cancel() } catch { /* ignore */ }
+    }
+    if (this.currentSource) {
+      try { this.currentSource.stop() } catch { /* ignore */ }
+      this.currentSource = null
+    }
+    this.onModelProgress?.(null)
+    this.setActive(false)
+  }
+  // Lazily create the shared AudioContext (with the webkit-prefixed fallback).
+  private ensureCtx(): AudioContext {
+    if (!this.audioCtx) {
+      const Ctx = window.AudioContext || (window as unknown as { webkitAudioContext: typeof AudioContext }).webkitAudioContext
+      this.audioCtx = new Ctx()
+    }
+    return this.audioCtx
+  }
+  // Update the activity flag and notify the listener only on an actual change.
+  private setActive(a: boolean): void {
+    if (this.active === a) return
+    this.active = a
+    this.onActivityChange?.(a)
+  }
+  // Speak queued chunks one after another until the queue is empty or stop()
+  // bumps the generation. A failing chunk is reported and skipped.
+  private async drain(): Promise<void> {
+    if (this.draining) return
+    this.draining = true
+    const gen = this.generation
+    this.setActive(true)
+    while (this.queue.length && gen === this.generation) {
+      const chunk = this.queue.shift()
+      if (chunk === undefined) break
+      try {
+        if (getEngine() === 'neural') await this.speakNeural(chunk, gen)
+        else await this.speakWeb(chunk, gen)
+      } catch (err) {
+        console.error('[tts] speak failed', err)
+        this.onModelProgress?.(null)
+        const detail = err instanceof Error ? err.message : String(err)
+        this.onError?.(getEngine() === 'neural' ? `Neural voice failed: ${detail}` : 'Speech failed')
+      }
+    }
+    // Only the current generation may reset state; after stop() a newer drain
+    // may already own these flags.
+    if (gen === this.generation) {
+      this.draining = false
+      this.setActive(false)
+    }
+  }
+  // Speak one chunk with the Web Speech API. Resolves on end AND on error so
+  // the queue keeps moving (stop() relies on cancel() ending the utterance).
+  private speakWeb(text: string, gen: number): Promise<void> {
+    return new Promise((resolve) => {
+      if (typeof speechSynthesis === 'undefined' || gen !== this.generation) return resolve()
+      const u = new SpeechSynthesisUtterance(text)
+      u.rate = getRate()
+      const wantUri = getVoiceURI()
+      if (wantUri) {
+        const v = speechSynthesis.getVoices().find((vv) => vv.voiceURI === wantUri)
+        if (v) u.voice = v
+      }
+      u.onend = () => resolve()
+      u.onerror = () => resolve()
+      speechSynthesis.speak(u)
+    })
+  }
+  // Speak one chunk with the neural model: load (or reuse) the synthesizer,
+  // synthesize PCM, then play it. Throws when the model returns no audio.
+  private async speakNeural(text: string, gen: number): Promise<void> {
+    console.info('[tts] neural: loading model…')
+    this.onModelProgress?.({ status: 'loading' })
+    const synth = await loadSynthesizer((p) => this.onModelProgress?.(p))
+    if (gen !== this.generation) {
+      // stop() ran while the model was loading. Loader progress events may have
+      // re-shown the indicator after stop() cleared it, so clear it again
+      // instead of leaving a stale progress state behind.
+      this.onModelProgress?.(null)
+      return
+    }
+    console.info('[tts] neural: synthesizing', JSON.stringify(text.slice(0, 60)))
+    const out = await synth(text)
+    const raw = Array.isArray(out) ? out[0] : out
+    this.onModelProgress?.(null)
+    if (gen !== this.generation) return
+    if (!raw?.audio?.length) throw new Error('neural synth returned no audio')
+    console.info(`[tts] neural: playing ${raw.audio.length} samples @ ${raw.sampling_rate}Hz`)
+    await this.playPcm(raw.audio, raw.sampling_rate, gen)
+  }
+  // Play mono PCM through the AudioContext; resolves when playback ends
+  // (including when stop() halts the source).
+  private async playPcm(pcm: Float32Array, sampleRate: number, gen: number): Promise<void> {
+    if (gen !== this.generation) return
+    const ctx = this.ensureCtx()
+    // Autoplay policy can leave the context suspended; resume before playing.
+    if (ctx.state === 'suspended') {
+      try { await ctx.resume() } catch { /* ignore */ }
+    }
+    if (gen !== this.generation) return
+    return new Promise((resolve) => {
+      const buffer = ctx.createBuffer(1, pcm.length, sampleRate)
+      buffer.getChannelData(0).set(pcm)
+      const source = ctx.createBufferSource()
+      source.buffer = buffer
+      source.connect(ctx.destination)
+      source.onended = () => {
+        if (this.currentSource === source) this.currentSource = null
+        resolve()
+      }
+      this.currentSource = source
+      source.start()
+    })
+  }
+}
+// Shared controller instance exported by this module.
+export const speech = new SpeechController()

package/src/renderer/styles.css CHANGED Viewed

@@ -659,6 +659,275 @@ html, body, #root {
   white-space: normal;
 }
+/* Floating push-to-talk mic (draggable, dismissable FAB) */
+.floating-mic { /* fixed-position column container for the handle row and the round button */
+  position: fixed;
+  z-index: 200;
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+  gap: 4px;
+  padding: 4px 4px 6px;
+  background: #16161e;
+  border: 1px solid #3b3d57;
+  border-radius: 12px;
+  box-shadow: 0 4px 16px rgba(0, 0, 0, 0.45);
+  user-select: none;
+  -webkit-user-select: none;
+}
+.floating-mic-handle { /* full-width row with grab cursor; touch-action disabled */
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  width: 100%;
+  cursor: grab;
+  touch-action: none;
+}
+.floating-mic-handle:active { cursor: grabbing; }
+.floating-mic-grip {
+  color: #565f89;
+  font-size: 12px;
+  line-height: 1;
+  padding: 0 2px;
+}
+.floating-mic-hide { /* borderless button; turns red on hover */
+  background: transparent;
+  border: none;
+  color: #565f89;
+  font-size: 15px;
+  line-height: 1;
+  cursor: pointer;
+  padding: 0 2px;
+}
+.floating-mic-hide:hover { color: #f7768e; }
+.voice-btn-floating { /* 48px round button; relative so absolutely positioned children anchor to it */
+  width: 48px;
+  height: 48px;
+  border-radius: 50%;
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  background: #2a2b3d;
+  border: 1px solid #3b3d57;
+  color: #a9b1d6;
+  cursor: pointer;
+  position: relative;
+  transition: all 0.15s;
+}
+.voice-btn-floating:hover { border-color: #565f89; }
+.voice-btn-floating .voice-icon { font-size: 22px; }
+.voice-btn-floating .voice-dot { /* dot pinned to the button's top-right corner */
+  position: absolute;
+  top: 5px;
+  right: 5px;
+}
+/* Floating chip sits above the round button rather than below it. */
+.voice-chip-floating { /* bottom: 100% places it above; left 50% + translateX(-50%) centers it */
+  top: auto;
+  bottom: 100%;
+  right: auto;
+  left: 50%;
+  transform: translateX(-50%);
+  margin-top: 0;
+  margin-bottom: 8px;
+}
+/* Mic source picker (small caret badge on the round mic + dropdown) */
+.voice-source-btn { /* 18px round badge overlapping the bottom-right edge of its positioned ancestor */
+  position: absolute;
+  bottom: -3px;
+  right: -3px;
+  width: 18px;
+  height: 18px;
+  border-radius: 50%;
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  padding: 0;
+  font-size: 10px;
+  line-height: 1;
+  background: #2a2b3d;
+  border: 1px solid #3b3d57;
+  color: #a9b1d6;
+  cursor: pointer;
+  z-index: 60;
+}
+.voice-source-btn:hover {
+  border-color: #7aa2f7;
+  color: #c0caf5;
+}
+.voice-source-menu { /* dropdown that opens upward (bottom: 100%), right-aligned, 200–280px wide */
+  position: absolute;
+  bottom: 100%;
+  right: 0;
+  margin-bottom: 10px;
+  min-width: 200px;
+  max-width: 280px;
+  padding: 4px;
+  background: #16161e;
+  border: 1px solid #3b3d57;
+  border-radius: 8px;
+  box-shadow: 0 6px 20px rgba(0, 0, 0, 0.5);
+  z-index: 300;
+}
+.voice-source-head { /* small uppercase heading */
+  font-size: 10px;
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  color: #565f89;
+  padding: 4px 8px 6px;
+}
+.voice-source-item { /* full-width borderless row */
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  width: 100%;
+  padding: 5px 8px;
+  border: none;
+  border-radius: 5px;
+  background: transparent;
+  color: #a9b1d6;
+  font-size: 12px;
+  text-align: left;
+  cursor: pointer;
+}
+.voice-source-item:hover { background: #1f2335; }
+.voice-source-item.active { color: #7aa2f7; }
+.voice-source-check { /* fixed 12px column so labels line up */
+  flex: 0 0 12px;
+  width: 12px;
+  color: #7aa2f7;
+  font-size: 11px;
+}
+.voice-source-label { /* single line, truncated with an ellipsis */
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+.voice-source-empty {
+  padding: 6px 8px;
+  font-size: 11px;
+  color: #565f89;
+}
+/* Spoken-replies titlebar control */
+.speak-controls { /* relative so absolutely positioned descendants anchor to it */
+  position: relative;
+  display: inline-flex;
+  align-items: center;
+}
+.speak-btn.active { /* blue accent when the .active class is set */
+  color: #7aa2f7;
+  border-color: #7aa2f7;
+}
+.speak-pct { /* small amber text */
+  font-size: 9px;
+  margin-left: 3px;
+  color: #e0af68;
+}
+.speak-caret {
+  padding: 0 3px;
+  font-size: 10px;
+  min-width: 0;
+}
+.speak-stop { /* red accent */
+  color: #f7768e;
+  border-color: #f7768e;
+}
+.speak-stop:hover { background: #2d2030; }
+/* Error bubble: absolutely positioned just below its positioned ancestor,
+   right-aligned, capped at 340px wide. */
+.speak-error {
+  position: absolute;
+  top: 100%;
+  right: 0;
+  margin-top: 6px;
+  max-width: 340px;
+  padding: 4px 9px;
+  border-radius: 5px;
+  font-size: 11px;
+  line-height: 1.3;
+  white-space: normal;
+  /* Replaces the deprecated `word-break: break-word`; `overflow-wrap: anywhere`
+     is the standard way to let long unbroken strings wrap inside the bubble. */
+  overflow-wrap: anywhere;
+  background: #2d2030;
+  color: #f7768e;
+  border: 1px solid #f7768e;
+  z-index: 60;
+}
+.speak-menu { /* dropdown that opens downward (top: 100%), right-aligned, 220–280px wide */
+  position: absolute;
+  top: 100%;
+  right: 0;
+  margin-top: 6px;
+  min-width: 220px;
+  max-width: 280px;
+  padding: 6px;
+  background: #16161e;
+  border: 1px solid #3b3d57;
+  border-radius: 8px;
+  box-shadow: 0 6px 20px rgba(0, 0, 0, 0.5);
+  z-index: 300;
+}
+.speak-menu-head { /* small uppercase heading */
+  font-size: 10px;
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  color: #565f89;
+  padding: 6px 6px 4px;
+}
+.speak-menu-item { /* full-width borderless row */
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  width: 100%;
+  padding: 5px 6px;
+  border: none;
+  border-radius: 5px;
+  background: transparent;
+  color: #a9b1d6;
+  font-size: 12px;
+  text-align: left;
+  cursor: pointer;
+}
+.speak-menu-item:hover { background: #1f2335; }
+.speak-menu-item.active { color: #7aa2f7; }
+.speak-menu-check { /* fixed 12px column so labels line up */
+  flex: 0 0 12px;
+  width: 12px;
+  color: #7aa2f7;
+  font-size: 11px;
+}
+.speak-select { /* full-width control styled to match the menu */
+  width: 100%;
+  margin: 2px 0 4px;
+  padding: 4px 6px;
+  background: #1f2335;
+  color: #a9b1d6;
+  border: 1px solid #3b3d57;
+  border-radius: 5px;
+  font-size: 12px;
+}
+.speak-rate { /* full-width control; accent-color tints it blue */
+  width: 100%;
+  margin: 2px 0 6px;
+  accent-color: #7aa2f7;
+}
+.speak-menu-row { /* horizontal row with a 6px gap */
+  display: flex;
+  gap: 6px;
+  padding: 2px 0 0;
+}
+.speak-menu-btn { /* flex: 1 makes buttons in a flex row share its width equally */
+  flex: 1;
+  padding: 5px 6px;
+  background: #2a2b3d;
+  color: #a9b1d6;
+  border: 1px solid #3b3d57;
+  border-radius: 5px;
+  font-size: 12px;
+  cursor: pointer;
+}
+.speak-menu-btn:hover { border-color: #7aa2f7; color: #c0caf5; }
 /* Editor panel */
 .editor-panel {
   display: flex;