npm - @phenx-inc/ctlsurf - Versions diffs - 0.5.2 → 0.6.0 - Mend

@phenx-inc/ctlsurf 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/out/renderer/assets/{tsMode-EIuSGG42.js → tsMode-CYd3NUkW.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { c as createWebWorker, e as editor, U as Uri, a as MarkerTag, M as MarkerSeverity, l as languages, t as typescriptDefaults, R as Range } from "./index-DBt_rov1.js";
+import { c as createWebWorker, e as editor, U as Uri, a as MarkerTag, M as MarkerSeverity, l as languages, t as typescriptDefaults, R as Range } from "./index-B60JU1yI.js";
 class WorkerManager {
   constructor(_modeId, _defaults) {
     this._modeId = _modeId;

package/out/renderer/assets/{typescript-DQkV4kKA.js → typescript-rkc9lhpi.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { l as languages } from "./index-DBt_rov1.js";
+import { l as languages } from "./index-B60JU1yI.js";
 const conf = {
   wordPattern: /(-?\d*\.\d\w*)|([^\`\~\!\@\#\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s]+)/g,
   comments: {

package/out/renderer/assets/{xml-DJ0OOQTu.js → xml-EsHEUps1.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { l as languages } from "./index-DBt_rov1.js";
+import { l as languages } from "./index-B60JU1yI.js";
 const conf = {
   comments: {
     blockComment: ["<!--", "-->"]

package/out/renderer/assets/{yaml-DxX26XLN.js → yaml-B9-nQ_s2.js} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { l as languages } from "./index-DBt_rov1.js";
+import { l as languages } from "./index-B60JU1yI.js";
 const conf = {
   comments: {
     lineComment: "#"

package/out/renderer/index.html CHANGED Viewed

@@ -4,8 +4,8 @@
   <meta charset="UTF-8" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>ctlsurf-worker</title>
-  <script type="module" crossorigin src="./assets/index-DBt_rov1.js"></script>
-  <link rel="stylesheet" crossorigin href="./assets/index-ezC-iarf.css">
+  <script type="module" crossorigin src="./assets/index-B60JU1yI.js"></script>
+  <link rel="stylesheet" crossorigin href="./assets/index-DJFYmHjz.css">
 </head>
 <body>
   <div id="root"></div>

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@phenx-inc/ctlsurf",
-  "version": "0.5.2",
+  "version": "0.6.0",
   "description": "Agent-agnostic terminal and desktop app for ctlsurf — run Claude Code, Codex, or any coding agent with live session logging and remote control",
   "main": "out/main/index.js",
   "bin": {
@@ -43,6 +43,7 @@
     "node": ">=18"
   },
   "dependencies": {
+    "@huggingface/transformers": "^4.2.0",
     "@monaco-editor/react": "^4.7.0",
     "@xterm/addon-fit": "^0.10.0",
     "@xterm/addon-serialize": "^0.14.0",

package/src/main/index.ts CHANGED Viewed

@@ -132,6 +132,13 @@ function createWindow(): void {
     }
   })
+  // Voice typing needs microphone access. Approve permission requests (there
+  // was no handler before, so the renderer already had the default-permissive
+  // behavior — this just ensures the mic request resolves to "allow").
+  mainWindow.webContents.session.setPermissionRequestHandler((_wc, _permission, callback) => {
+    callback(true)
+  })
   if (process.env.ELECTRON_RENDERER_URL) {
     mainWindow.loadURL(process.env.ELECTRON_RENDERER_URL)
   } else {

package/src/renderer/App.tsx CHANGED Viewed

@@ -1,5 +1,6 @@
 import { useState, useEffect, useCallback, useRef } from 'react'
-import { TerminalPanel, destroyTerminal } from './components/TerminalPanel'
+import { TerminalPanel, destroyTerminal, focusTerminal } from './components/TerminalPanel'
+import { VoiceInput } from './components/VoiceInput'
 import { CtlsurfPanel } from './components/CtlsurfPanel'
 import { EditorPanel } from './components/EditorPanel'
 import { AgentPicker } from './components/AgentPicker'
@@ -206,6 +207,16 @@ export default function App() {
     }
   }, [trackingActive])
+  // Voice typing: inject the transcribed text into the active terminal exactly
+  // as if it were typed (no auto-submit), then refocus so the user can press
+  // Enter to send it.
+  const handleVoiceTranscript = useCallback((text: string) => {
+    const trimmed = text.trim()
+    if (!trimmed) return
+    window.worker.writePty(activeTabId, trimmed)
+    focusTerminal(activeTabId)
+  }, [activeTabId])
   const cwdRef = useRef<string | null>(null)
   const handleSpawn = useCallback(async (tabId: string, agent: AgentConfig) => {
@@ -471,6 +482,7 @@ export default function App() {
             </svg>
             <span>Tickets</span>
           </button>
+          <VoiceInput onTranscript={handleVoiceTranscript} />
           <span className="titlebar-separator" />
           {agents.map(a => {
             const activeTab = tabs.find(t => t.id === activeTabId)

package/src/renderer/components/TerminalPanel.tsx CHANGED Viewed

@@ -185,6 +185,12 @@ function getOrCreateTerminal(tabId: string, onExit: (tabId: string) => void): {
   return { terminal, fitAddon }
 }
+/**
+ * Hand keyboard focus back to the terminal registered for `tabId` — used after
+ * voice text is inserted so the user can press Enter straight away. Does
+ * nothing when no terminal exists for that tab.
+ */
+export function focusTerminal(tabId: string): void {
+  const entry = _terminals.get(tabId)
+  if (entry == null) return
+  entry.terminal.focus()
+}
 export function destroyTerminal(tabId: string): void {
   const state = _terminals.get(tabId)
   if (!state) return

package/src/renderer/components/VoiceInput.tsx ADDED Viewed

@@ -0,0 +1,313 @@
+import { useCallback, useEffect, useRef, useState } from 'react'
+import { transcribeBlob, type ModelProgress } from '../lib/localWhisper'
+// ─── Minimal Web Speech API typings ──────────────────
+// webkitSpeechRecognition isn't in the standard DOM lib, so declare just the
+// surface we use. This API is frequently unavailable inside Electron (Chromium
+// ships without Google's speech backend); when it fails we fall back to a local
+// Whisper model (see ../lib/localWhisper).
+interface SpeechRecognitionResult {
+  isFinal: boolean
+  0: { transcript: string }
+}
+interface SpeechRecognitionEvent {
+  resultIndex: number
+  results: { length: number;[index: number]: SpeechRecognitionResult }
+}
+interface SpeechRecognitionErrorEvent { error: string }
+interface SpeechRecognitionLike {
+  lang: string
+  continuous: boolean
+  interimResults: boolean
+  start: () => void
+  stop: () => void
+  abort: () => void
+  onresult: ((e: SpeechRecognitionEvent) => void) | null
+  onerror: ((e: SpeechRecognitionErrorEvent) => void) | null
+  onend: (() => void) | null
+}
+type SpeechRecognitionCtor = new () => SpeechRecognitionLike
+/**
+ * Resolve the Web Speech recognition constructor from `window`, preferring the
+ * standard name over the webkit-prefixed one. Returns null when neither exists.
+ */
+function getRecognitionCtor(): SpeechRecognitionCtor | null {
+  type SpeechGlobals = {
+    SpeechRecognition?: SpeechRecognitionCtor
+    webkitSpeechRecognition?: SpeechRecognitionCtor
+  }
+  const globals = window as unknown as SpeechGlobals
+  const standard = globals.SpeechRecognition
+  if (standard) return standard
+  const prefixed = globals.webkitSpeechRecognition
+  if (prefixed) return prefixed
+  return null
+}
+// ─── Capabilities & engine selection ─────────────────
+type Engine = 'web-speech' | 'local'
+type Phase = 'idle' | 'listening' | 'transcribing'
+const ENGINE_KEY = 'ctlsurf.voiceEngine'
+const WEB_SPEECH_SUPPORTED = getRecognitionCtor() !== null
+const LOCAL_SUPPORTED =
+  typeof navigator !== 'undefined' &&
+  !!navigator.mediaDevices?.getUserMedia &&
+  typeof MediaRecorder !== 'undefined' &&
+  typeof OfflineAudioContext !== 'undefined'
+const ANY_SUPPORTED = WEB_SPEECH_SUPPORTED || LOCAL_SUPPORTED
+/**
+ * Choose the engine to start with: local when it is the only usable one or when
+ * the stored preference asks for it (and it is supported); otherwise Web Speech
+ * if available, falling back to local.
+ */
+function loadInitialEngine(): Engine {
+  // Only the local engine is usable: nothing to choose.
+  if (LOCAL_SUPPORTED && !WEB_SPEECH_SUPPORTED) return 'local'
+  let stored: string | null = null
+  try {
+    stored = localStorage.getItem(ENGINE_KEY)
+  } catch { /* ignore */ }
+  if (stored === 'local' && LOCAL_SUPPORTED) return 'local'
+  if (WEB_SPEECH_SUPPORTED) return 'web-speech'
+  return 'local'
+}
+// True for the Web Speech error codes that mean the recognition service itself
+// cannot be reached. Mic permission/hardware errors are deliberately excluded:
+// those would break the local fallback as well.
+function isEngineUnavailable(code: string): boolean {
+  switch (code) {
+    case 'network':
+    case 'service-not-allowed':
+      return true
+    default:
+      return false
+  }
+}
+// Turn a microphone start-up failure into a short user-facing message, keyed
+// off the error's `name`; anything unrecognised gets the generic message.
+function describeMicError(err: unknown): string {
+  const { name } = (err ?? {}) as { name?: string }
+  switch (name) {
+    case 'NotAllowedError':
+    case 'SecurityError':
+      return 'Microphone access denied'
+    case 'NotFoundError':
+      return 'No microphone found'
+    default:
+      return 'Could not start microphone'
+  }
+}
+interface VoiceInputProps {
+  // Called once per push-to-talk session with the final transcribed text.
+  onTranscript: (text: string) => void
+}
+/**
+ * Push-to-talk microphone button.
+ *
+ * Holding the button captures speech with the selected engine — the Web Speech
+ * API, or a MediaRecorder capture that is transcribed by the local Whisper
+ * model — and releasing it ends the session. `onTranscript` is called with the
+ * final text whenever a session produces any. A chip below the button shows
+ * interim text, model download progress, notices and errors.
+ */
+export function VoiceInput({ onTranscript }: VoiceInputProps) {
+  const [engine, setEngine] = useState<Engine>(loadInitialEngine)
+  const [phase, setPhase] = useState<Phase>('idle')
+  const [interim, setInterim] = useState('')
+  const [modelPct, setModelPct] = useState<number | null>(null)
+  const [error, setError] = useState<string | null>(null)
+  const [notice, setNotice] = useState<string | null>(null)
+  // Web Speech refs
+  const recognitionRef = useRef<SpeechRecognitionLike | null>(null)
+  const finalRef = useRef('')
+  // Local (Whisper) refs
+  const streamRef = useRef<MediaStream | null>(null)
+  const recorderRef = useRef<MediaRecorder | null>(null)
+  const chunksRef = useRef<Blob[]>([])
+  // Set true when the user releases before getUserMedia resolves (quick tap).
+  const cancelGestureRef = useRef(false)
+  // True while a getUserMedia request is still pending, so a second press
+  // cannot open a parallel capture whose stream/recorder would be orphaned.
+  const startingRef = useRef(false)
+  // Mirror state/props into refs so long-lived callbacks always see the latest.
+  const engineRef = useRef(engine)
+  useEffect(() => { engineRef.current = engine }, [engine])
+  const onTranscriptRef = useRef(onTranscript)
+  useEffect(() => { onTranscriptRef.current = onTranscript }, [onTranscript])
+  // Auto-dismiss transient chips.
+  useEffect(() => {
+    if (!error) return
+    const t = setTimeout(() => setError(null), 4500)
+    return () => clearTimeout(t)
+  }, [error])
+  useEffect(() => {
+    if (!notice) return
+    const t = setTimeout(() => setNotice(null), 5000)
+    return () => clearTimeout(t)
+  }, [notice])
+  // Persist the preference for the local engine and tell the user why.
+  const switchToLocal = useCallback((reason: string) => {
+    try { localStorage.setItem(ENGINE_KEY, 'local') } catch { /* ignore */ }
+    setEngine('local')
+    setNotice(reason)
+  }, [])
+  const stopStream = useCallback(() => {
+    streamRef.current?.getTracks().forEach((t) => t.stop())
+    streamRef.current = null
+  }, [])
+  // ─── Web Speech engine ─────────────────────────────
+  const startWebSpeech = useCallback(() => {
+    const Ctor = getRecognitionCtor()
+    if (!Ctor || recognitionRef.current) return
+    setError(null); setNotice(null); setInterim('')
+    finalRef.current = ''
+    const rec = new Ctor()
+    rec.lang = navigator.language || 'en-US'
+    rec.continuous = true
+    rec.interimResults = true
+    rec.onresult = (event) => {
+      // Rebuild both strings from the full result list on every event.
+      let finalText = ''
+      let interimText = ''
+      for (let i = 0; i < event.results.length; i++) {
+        const res = event.results[i]
+        if (res.isFinal) finalText += res[0].transcript
+        else interimText += res[0].transcript
+      }
+      finalRef.current = finalText
+      setInterim(interimText)
+    }
+    rec.onerror = (event) => {
+      if (isEngineUnavailable(event.error) && LOCAL_SUPPORTED) {
+        // The streamed audio is gone; switch engines and ask for a retry.
+        finalRef.current = ''
+        switchToLocal('Voice service unavailable — switched to on-device. Press again.')
+      } else if (event.error !== 'no-speech' && event.error !== 'aborted') {
+        setError(event.error === 'not-allowed' ? 'Microphone access denied' : `Voice error: ${event.error}`)
+      }
+    }
+    rec.onend = () => {
+      const text = finalRef.current.trim()
+      recognitionRef.current = null
+      setPhase('idle')
+      setInterim('')
+      if (text) onTranscriptRef.current(text)
+    }
+    recognitionRef.current = rec
+    try {
+      rec.start()
+      setPhase('listening')
+    } catch (err) {
+      recognitionRef.current = null
+      setPhase('idle')
+      setError('Could not start microphone')
+      console.error('[voice] web speech start failed', err)
+    }
+  }, [switchToLocal])
+  const stopWebSpeech = useCallback(() => {
+    try { recognitionRef.current?.stop() } catch { /* already stopped */ }
+  }, [])
+  // ─── Local (Whisper) engine ────────────────────────
+  const handleModelProgress = useCallback((p: ModelProgress) => {
+    if (p.status === 'progress' && typeof p.progress === 'number') {
+      setModelPct(Math.min(100, Math.round(p.progress)))
+    }
+  }, [])
+  // Runs when the recorder stops: release the mic, then transcribe the clip.
+  const runLocalTranscription = useCallback(async (rec: MediaRecorder) => {
+    stopStream()
+    const blob = new Blob(chunksRef.current, { type: rec.mimeType || 'audio/webm' })
+    chunksRef.current = []
+    recorderRef.current = null
+    if (blob.size === 0) { setPhase('idle'); return }
+    setPhase('transcribing')
+    setInterim('')
+    try {
+      const text = await transcribeBlob(blob, handleModelProgress)
+      if (text) onTranscriptRef.current(text)
+    } catch (err) {
+      setError('On-device transcription failed')
+      console.error('[voice] local transcription failed', err)
+    } finally {
+      setPhase('idle')
+      setModelPct(null)
+    }
+  }, [stopStream, handleModelProgress])
+  const startLocal = useCallback(async () => {
+    setError(null); setNotice(null); setInterim('')
+    cancelGestureRef.current = false
+    // A request from an earlier press is still pending. It will see the cleared
+    // cancel flag and start recording for this press, so don't open a second
+    // capture (which would orphan the first stream and recorder).
+    if (startingRef.current) return
+    startingRef.current = true
+    try {
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+      // Released during the permission/await — don't record anything.
+      if (cancelGestureRef.current) {
+        stream.getTracks().forEach((t) => t.stop())
+        setPhase('idle')
+        return
+      }
+      streamRef.current = stream
+      chunksRef.current = []
+      const rec = new MediaRecorder(stream)
+      rec.ondataavailable = (e) => { if (e.data.size) chunksRef.current.push(e.data) }
+      rec.onstop = () => { void runLocalTranscription(rec) }
+      recorderRef.current = rec
+      rec.start()
+      setPhase('listening')
+    } catch (err) {
+      stopStream()
+      setPhase('idle')
+      setError(describeMicError(err))
+      console.error('[voice] getUserMedia failed', err)
+    } finally {
+      startingRef.current = false
+    }
+  }, [runLocalTranscription, stopStream])
+  const stopLocal = useCallback(() => {
+    cancelGestureRef.current = true
+    const rec = recorderRef.current
+    if (rec && rec.state !== 'inactive') {
+      try { rec.stop() } catch { /* ignore */ }
+    }
+  }, [])
+  // ─── Push-to-talk gesture ──────────────────────────
+  const handlePointerDown = (e: React.PointerEvent) => {
+    if (!ANY_SUPPORTED || phase !== 'idle') return
+    e.preventDefault()
+    // Capture the pointer so the release is delivered even off the button.
+    e.currentTarget.setPointerCapture?.(e.pointerId)
+    if (engineRef.current === 'web-speech' && WEB_SPEECH_SUPPORTED) startWebSpeech()
+    else if (LOCAL_SUPPORTED) void startLocal()
+  }
+  const handlePointerUp = (e: React.PointerEvent) => {
+    e.currentTarget.releasePointerCapture?.(e.pointerId)
+    if (engineRef.current === 'web-speech') stopWebSpeech()
+    else stopLocal()
+  }
+  // Clean up on unmount. Handlers are detached before stopping so that tearing
+  // down cannot deliver a transcript, start a transcription, or touch state for
+  // a component that no longer exists.
+  useEffect(() => () => {
+    // Make a still-pending getUserMedia release its stream when it resolves.
+    cancelGestureRef.current = true
+    const recognition = recognitionRef.current
+    if (recognition) {
+      recognition.onresult = null
+      recognition.onerror = null
+      recognition.onend = null
+      try { recognition.abort() } catch { /* ignore */ }
+      recognitionRef.current = null
+    }
+    const recorder = recorderRef.current
+    if (recorder) {
+      recorder.onstop = null
+      recorder.ondataavailable = null
+      try { recorder.stop() } catch { /* ignore */ }
+      recorderRef.current = null
+    }
+    streamRef.current?.getTracks().forEach((t) => t.stop())
+    streamRef.current = null
+  }, [])
+  // ─── Render ────────────────────────────────────────
+  const listening = phase === 'listening'
+  const busy = phase === 'transcribing'
+  const title = !ANY_SUPPORTED
+    ? 'Voice typing not supported in this build'
+    : listening
+      ? 'Listening… release to insert'
+      : busy
+        ? 'Transcribing…'
+        : engine === 'local'
+          ? 'Hold to talk (on-device) — speech is typed into the terminal'
+          : 'Hold to talk — speech is typed into the terminal'
+  // Errors and notices only show while idle; live status wins otherwise.
+  let chip: { kind: 'listening' | 'busy' | 'notice' | 'error'; text: string } | null = null
+  if (error && phase === 'idle') chip = { kind: 'error', text: error }
+  else if (notice && phase === 'idle') chip = { kind: 'notice', text: notice }
+  else if (listening) chip = { kind: 'listening', text: interim || (engine === 'local' ? 'Recording…' : 'Listening…') }
+  else if (busy) chip = { kind: 'busy', text: modelPct !== null ? `Downloading voice model… ${modelPct}%` : 'Transcribing…' }
+  return (
+    <div className="voice-input-wrap">
+      <button
+        type="button"
+        className={`titlebar-btn titlebar-icon-btn voice-btn ${listening ? 'listening' : ''} ${busy ? 'busy' : ''}`}
+        disabled={!ANY_SUPPORTED}
+        onPointerDown={handlePointerDown}
+        onPointerUp={handlePointerUp}
+        onPointerCancel={handlePointerUp}
+        onContextMenu={(e) => e.preventDefault()}
+        title={title}
+        aria-label="Voice typing (hold to talk)"
+      >
+        <span className="voice-icon" aria-hidden="true">🎤</span>
+        <span className={`voice-dot ${listening ? 'on' : busy ? 'busy' : 'off'}`} />
+      </button>
+      {chip && <div className={`voice-chip ${chip.kind}`}>{chip.text}</div>}
+    </div>
+  )
+}

package/src/renderer/lib/localWhisper.ts ADDED Viewed

@@ -0,0 +1,88 @@
+// Local, offline speech-to-text via transformers.js (Whisper). Used as the
+// fallback when the browser's Web Speech API is unavailable (the common case
+// inside packaged Electron). The library and the ~75MB model are fetched lazily
+// the first time a local transcription is requested, then cached by the runtime.
+const MODEL = 'Xenova/whisper-base'
+const TARGET_SAMPLE_RATE = 16000
+export interface ModelProgress {
+  status: string
+  file?: string
+  progress?: number // 0–100 for the file currently downloading
+  loaded?: number
+  total?: number
+}
+// transformers.js is large and node-aware, so import it dynamically (Vite
+// code-splits it into its own chunk that only loads on first local use).
+type Transcriber = (audio: Float32Array, options?: Record<string, unknown>) => Promise<{ text: string } | Array<{ text: string }>>
+let transcriberPromise: Promise<Transcriber> | null = null
+/**
+ * Reports whether a model load has been started and has not failed. Note that
+ * this stays true after the load completes: the cached promise is only cleared
+ * when loading fails.
+ */
+export function isLocalModelLoading(): boolean {
+  const cached = transcriberPromise
+  return cached !== null
+}
+/**
+ * Return the shared Whisper transcriber, starting the (lazy) library import and
+ * model load on first use. Concurrent and later callers share the same promise;
+ * `onProgress` is only wired up by the call that actually starts the load. If
+ * the load fails the cache is cleared so a later call can try again.
+ */
+export async function loadTranscriber(onProgress?: (p: ModelProgress) => void): Promise<Transcriber> {
+  if (transcriberPromise) return transcriberPromise
+  const loading = (async () => {
+    const { pipeline, env } = await import('@huggingface/transformers')
+    // No model files ship with the app; always fetch from the Hugging Face hub.
+    env.allowLocalModels = false
+    const baseOptions = { progress_callback: onProgress as never }
+    // Try WebGPU first when the runtime exposes it; the default (WASM) backend
+    // is the fallback. Checking navigator.gpu up front avoids a wasted partial
+    // download when there is no GPU path at all.
+    const webGpuAvailable = typeof navigator !== 'undefined' && 'gpu' in navigator
+    if (webGpuAvailable) {
+      try {
+        const gpuPipeline = await pipeline('automatic-speech-recognition', MODEL, { ...baseOptions, device: 'webgpu' })
+        return gpuPipeline as unknown as Transcriber
+      } catch (err) {
+        console.warn('[voice] WebGPU backend failed, falling back to WASM', err)
+      }
+    }
+    const defaultPipeline = await pipeline('automatic-speech-recognition', MODEL, baseOptions)
+    return defaultPipeline as unknown as Transcriber
+  })()
+  transcriberPromise = loading
+  // Allow a later retry if the first load fails (e.g. offline on first use).
+  loading.catch(() => { transcriberPromise = null })
+  return loading
+}
+// Decode a recorded audio Blob and resample it to mono 16kHz Float32 PCM, which
+// is what Whisper expects. Returns null for empty/undecodable clips.
+async function blobToPcm16k(blob: Blob): Promise<Float32Array | null> {
+  if (blob.size === 0) return null
+  const arrayBuffer = await blob.arrayBuffer()
+  const AudioCtx = window.AudioContext || (window as unknown as { webkitAudioContext: typeof AudioContext }).webkitAudioContext
+  const ctx = new AudioCtx()
+  let decoded: AudioBuffer
+  try {
+    decoded = await ctx.decodeAudioData(arrayBuffer)
+  } catch {
+    return null
+  } finally {
+    // close() is asynchronous. The context is only needed for decoding, so a
+    // failure to close must not surface as an unhandled promise rejection.
+    void ctx.close().catch(() => { /* ignore */ })
+  }
+  // Number of output frames at the target rate; nothing to render if zero.
+  const length = Math.ceil(decoded.duration * TARGET_SAMPLE_RATE)
+  if (length < 1) return null
+  // Render through a single-channel offline context at the target rate to
+  // produce the mono, resampled signal.
+  const offline = new OfflineAudioContext(1, length, TARGET_SAMPLE_RATE)
+  const source = offline.createBufferSource()
+  source.buffer = decoded
+  source.connect(offline.destination)
+  source.start()
+  const rendered = await offline.startRendering()
+  return rendered.getChannelData(0)
+}
+/**
+ * Transcribe a recorded audio Blob with the local Whisper model.
+ *
+ * The clip is decoded first, so an empty or undecodable recording returns ''
+ * without triggering the model load (a large download on first use).
+ *
+ * @param blob       Recorded audio.
+ * @param onProgress Optional model download progress callback, passed to the loader.
+ * @returns The trimmed transcript, or '' when there is nothing to transcribe.
+ */
+export async function transcribeBlob(blob: Blob, onProgress?: (p: ModelProgress) => void): Promise<string> {
+  const pcm = await blobToPcm16k(blob)
+  if (!pcm) return ''
+  const transcriber = await loadTranscriber(onProgress)
+  const result = await transcriber(pcm)
+  // The pipeline may return one result or a list of chunk results.
+  const text = Array.isArray(result)
+    ? result.map((r) => r.text).join(' ')
+    : result?.text
+  return (text ?? '').trim()
+}

package/src/renderer/styles.css CHANGED Viewed

@@ -570,6 +570,95 @@ html, body, #root {
   line-height: 1;
 }
+/* Voice typing (push-to-talk mic) */
+.voice-input-wrap {
+  position: relative;
+  display: inline-flex;
+}
+.voice-btn {
+  user-select: none;
+  -webkit-user-select: none;
+  touch-action: none;
+}
+.voice-btn:disabled {
+  opacity: 0.4;
+  cursor: not-allowed;
+}
+.voice-btn.listening {
+  color: #f7768e;
+  border-color: #f7768e;
+  background: #1f2335;
+}
+.voice-btn.busy {
+  color: #e0af68;
+  border-color: #e0af68;
+  background: #1f2335;
+}
+.voice-icon {
+  font-size: 13px;
+  line-height: 1;
+}
+.voice-dot {
+  width: 6px;
+  height: 6px;
+  border-radius: 50%;
+  display: inline-block;
+  vertical-align: middle;
+  background: #565f89;
+}
+.voice-dot.on {
+  background: #f7768e;
+  box-shadow: 0 0 4px #f7768e;
+  animation: voice-pulse 1s ease-in-out infinite;
+}
+.voice-dot.busy {
+  background: #e0af68;
+  box-shadow: 0 0 4px #e0af68;
+  animation: voice-pulse 0.8s ease-in-out infinite;
+}
+@keyframes voice-pulse {
+  0%, 100% { opacity: 1; }
+  50% { opacity: 0.3; }
+}
+.voice-chip {
+  position: absolute;
+  top: 100%;
+  right: 0;
+  margin-top: 6px;
+  max-width: 320px;
+  padding: 4px 9px;
+  border-radius: 5px;
+  font-size: 11px;
+  line-height: 1.3;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  z-index: 50;
+  pointer-events: none;
+  border: 1px solid #3b3d57;
+}
+.voice-chip.listening {
+  background: #1f2335;
+  color: #a9b1d6;
+}
+.voice-chip.busy {
+  background: #1f2335;
+  color: #e0af68;
+  border-color: #e0af68;
+}
+.voice-chip.notice {
+  background: #1f2335;
+  color: #e0af68;
+  border-color: #e0af68;
+  white-space: normal;
+}
+.voice-chip.error {
+  background: #2d2030;
+  color: #f7768e;
+  border-color: #f7768e;
+  white-space: normal;
+}
 /* Editor panel */
 .editor-panel {
   display: flex;