@phenx-inc/ctlsurf 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/out/headless/index.mjs +26 -10
  2. package/out/headless/index.mjs.map +2 -2
  3. package/out/main/index.js +31 -9
  4. package/out/preload/index.js +8 -0
  5. package/out/renderer/assets/{cssMode-eTXVdAkZ.js → cssMode-BQN8v2ok.js} +3 -3
  6. package/out/renderer/assets/{freemarker2-B5BKaiK4.js → freemarker2-DbxGYYVp.js} +1 -1
  7. package/out/renderer/assets/{handlebars-BIdLd2wU.js → handlebars-3auU1CAd.js} +1 -1
  8. package/out/renderer/assets/{html-BXL4cnLS.js → html-D8xFiRmI.js} +1 -1
  9. package/out/renderer/assets/{htmlMode-46N3XG2c.js → htmlMode-M3MApZ4n.js} +3 -3
  10. package/out/renderer/assets/{index-dRvutfbl.js → index---H6cxNl.js} +696 -33
  11. package/out/renderer/assets/{index-Cf-RsxoC.css → index-B-iM7dFC.css} +195 -0
  12. package/out/renderer/assets/{javascript-n_iZZzDX.js → javascript-BO_ViZM5.js} +2 -2
  13. package/out/renderer/assets/{jsonMode-DXDczSNu.js → jsonMode-CKp2zvZu.js} +3 -3
  14. package/out/renderer/assets/{liquid-B1QweUh7.js → liquid-C1eHcrht.js} +1 -1
  15. package/out/renderer/assets/{lspLanguageFeatures-DqzMqkRk.js → lspLanguageFeatures-CHWJx_Tl.js} +1 -1
  16. package/out/renderer/assets/{mdx-BCv8lm5e.js → mdx-Qqdtk7fL.js} +1 -1
  17. package/out/renderer/assets/{python-BLNzYwDv.js → python-DKu7rNbs.js} +1 -1
  18. package/out/renderer/assets/{razor-CvAww8bG.js → razor-BOMpCo6z.js} +1 -1
  19. package/out/renderer/assets/{tsMode-C7m6Kr5E.js → tsMode-yAjlPR-D.js} +1 -1
  20. package/out/renderer/assets/{typescript-DhPw4VVg.js → typescript-BiJRCUcL.js} +1 -1
  21. package/out/renderer/assets/{xml-B0WLFJ2U.js → xml-D4PvYeQq.js} +1 -1
  22. package/out/renderer/assets/{yaml-BWyn9Wd7.js → yaml-BeHVkmnS.js} +1 -1
  23. package/out/renderer/index.html +2 -2
  24. package/package.json +1 -1
  25. package/src/main/index.ts +7 -0
  26. package/src/main/orchestrator.ts +38 -9
  27. package/src/preload/index.ts +11 -0
  28. package/src/renderer/App.tsx +5 -0
  29. package/src/renderer/components/SpeakControls.tsx +235 -0
  30. package/src/renderer/components/VoiceInput.tsx +159 -3
  31. package/src/renderer/lib/localWhisper.ts +48 -4
  32. package/src/renderer/lib/speech.ts +299 -0
  33. package/src/renderer/styles.css +195 -0
@@ -0,0 +1,299 @@
1
+ // Text-to-speech for spoken agent replies (Electron desktop only). Two engines
2
+ // the user can pick between:
3
+ // - 'web' → the built-in Web Speech synthesis API (OS voices, no download)
4
+ // - 'neural' → a local neural model via transformers.js (heavier, downloads
5
+ // on first use, more natural)
6
+ // A small queue serializes utterances so overlapping replies don't talk over
7
+ // each other, and stop() flushes everything.
8
+
9
+ export type TtsEngineId = 'web' | 'neural'
10
+
11
+ const ENGINE_KEY = 'ctlsurf.tts.engine'
12
+ const VOICE_KEY = 'ctlsurf.tts.voiceURI'
13
+ const RATE_KEY = 'ctlsurf.tts.rate'
14
+
15
// Cap so a runaway reply doesn't narrate for minutes; split into sentence-ish
// chunks so long passages stay reliable on the Web Speech backend.
const MAX_SPEAK_CHARS = 1600
const MAX_CHUNK_CHARS = 280

// ─── Text cleaning ───────────────────────────────────

// Turn a markdown-ish assistant reply into something listenable.
//
// Steps, in order: fenced code blocks are condensed to a short spoken marker,
// inline code and markdown links are unwrapped, bare URLs become "link", long
// slash-separated paths are reduced to their last segment, HTML tags and
// markdown markers are dropped, whitespace is collapsed, and the result is
// capped at MAX_SPEAK_CHARS (cut back to a word boundary, with a trailing "…").
//
// Returns the cleaned text; an empty string means there is nothing to speak.
export function cleanForSpeech(input: string): string {
  let t = input

  // Fenced code blocks → "code block, N lines." (blank lines are not counted)
  t = t.replace(/```[^\n]*\n?([\s\S]*?)```/g, (_m, body: string) => {
    const lines = body.replace(/\n+$/, '').split('\n').filter((l) => l.trim().length).length
    return lines > 0 ? ` (code block, ${lines} ${lines === 1 ? 'line' : 'lines'}) ` : ' (code) '
  })
  // Leftover/unterminated fence
  t = t.replace(/```/g, ' (code) ')

  // Inline code → spoken contents without backticks
  t = t.replace(/`([^`]+)`/g, '$1')
  // Markdown links [text](url) → text
  t = t.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
  // Bare URLs → "link"
  t = t.replace(/https?:\/\/\S+/g, 'link')
  // File paths with many slashes are noise when read aloud → basename
  t = t.replace(/(?:[\w.-]*\/){2,}([\w.-]+)/g, '$1')
  // HTML tags, if any slipped through. This must run BEFORE the marker strip
  // below: that pass removes every '>', after which no tag could ever match.
  // Only tag-shaped spans ('<' directly followed by a letter or '/') are
  // removed, so comparisons such as "a < b" are left alone.
  t = t.replace(/<\/?[a-zA-Z][^<>]*>/g, ' ')
  // Strip leftover markdown emphasis / heading / list / blockquote markers
  t = t.replace(/[*_#>]+/g, ' ')
  t = t.replace(/^\s*[-•]\s+/gm, ', ')
  // Collapse whitespace
  t = t.replace(/\s+/g, ' ').trim()

  if (t.length > MAX_SPEAK_CHARS) {
    // Drop the partial trailing word so the cut never lands mid-word.
    t = t.slice(0, MAX_SPEAK_CHARS).replace(/\s+\S*$/, '') + '…'
  }
  return t
}
56
+
57
// Break one over-long sentence into pieces of at most `cap` characters. Each
// cut is made at the last whitespace run inside the window so words stay
// intact; a fixed-offset cut is used only when the window has no whitespace.
function splitLongSentence(sentence: string, cap: number): string[] {
  const pieces: string[] = []
  let rest = sentence.trim()
  while (rest.length > cap) {
    // Look one character past the cap so a space sitting exactly at the
    // boundary still yields a full-width piece.
    const window = rest.slice(0, cap + 1)
    const lastGap = /\s+\S*$/.exec(window)
    const cut = lastGap && lastGap.index > 0 ? lastGap.index : cap
    pieces.push(rest.slice(0, cut))
    rest = rest.slice(cut).trim()
  }
  if (rest) pieces.push(rest)
  return pieces
}

// Split cleaned text into utterance-sized chunks of at most MAX_CHUNK_CHARS.
//
// Text that already fits is returned as a single chunk (or no chunks when
// empty). Otherwise sentences are packed greedily into chunks, and a single
// sentence longer than the cap is split at word boundaries rather than at a
// fixed offset, which used to cut words in half.
function splitIntoChunks(text: string): string[] {
  if (text.length <= MAX_CHUNK_CHARS) return text ? [text] : []
  // A "sentence" is a run ending in ., ! or ?, plus any unterminated tail.
  const sentences = text.match(/[^.!?]+[.!?]+|\S[^.!?]*$/g) || [text]
  const chunks: string[] = []
  let buf = ''
  for (const s of sentences) {
    // Flush the buffer when adding this sentence would overflow the cap.
    if ((buf + s).length > MAX_CHUNK_CHARS && buf) {
      chunks.push(buf.trim())
      buf = ''
    }
    if (s.length > MAX_CHUNK_CHARS) {
      chunks.push(...splitLongSentence(s, MAX_CHUNK_CHARS))
    } else {
      buf += s
    }
  }
  if (buf.trim()) chunks.push(buf.trim())
  return chunks
}
77
+
78
+ // ─── Preferences ─────────────────────────────────────
79
+
80
// Read the persisted engine choice. Anything other than the exact string
// 'neural' — including a missing key or unavailable storage — means 'web'.
export function getEngine(): TtsEngineId {
  let stored: string | null = null
  try {
    stored = localStorage.getItem(ENGINE_KEY)
  } catch {
    return 'web'
  }
  if (stored === 'neural') return 'neural'
  return 'web'
}
83
// Persist the engine choice. Storage failures are swallowed on purpose: the
// preference is best-effort and must never break the caller.
export function setEngine(id: TtsEngineId): void {
  try {
    localStorage.setItem(ENGINE_KEY, id)
  } catch {
    // ignore
  }
}
86
// Read the persisted voice URI. A missing key, an empty string, or unavailable
// storage all yield null.
export function getVoiceURI(): string | null {
  try {
    const stored = localStorage.getItem(VOICE_KEY)
    return stored ? stored : null
  } catch {
    return null
  }
}
89
// Persist the chosen voice URI. A null or empty value removes the stored key.
// Storage failures are swallowed on purpose.
export function setVoiceURI(uri: string | null): void {
  try {
    if (uri) {
      localStorage.setItem(VOICE_KEY, uri)
    } else {
      localStorage.removeItem(VOICE_KEY)
    }
  } catch {
    // ignore
  }
}
92
// Read the persisted speaking rate. Only finite values within [0.5, 2] are
// accepted; a missing key, any other value, or unavailable storage yields 1.
export function getRate(): number {
  let stored: string | null
  try {
    stored = localStorage.getItem(RATE_KEY)
  } catch {
    return 1
  }
  // Number(null) is 0, which falls outside the range and so maps to 1.
  const rate = Number(stored)
  const usable = Number.isFinite(rate) && rate >= 0.5 && rate <= 2
  return usable ? rate : 1
}
98
// Persist the speaking rate as its string form. No range check happens here;
// storage failures are swallowed on purpose.
export function setRate(rate: number): void {
  const serialized = String(rate)
  try {
    localStorage.setItem(RATE_KEY, serialized)
  } catch {
    // ignore
  }
}
101
+
102
// List the voices reported by the Web Speech API, or an empty list when the
// `speechSynthesis` global does not exist in this environment.
export function listWebVoices(): SpeechSynthesisVoice[] {
  const available = typeof speechSynthesis !== 'undefined'
  return available ? speechSynthesis.getVoices() : []
}
106
+
107
+ // ─── Neural engine (transformers.js) ─────────────────
108
+
109
+ // Self-contained neural voice: a VITS model that needs no speaker-embeddings
110
+ // file and no separate vocoder (unlike SpeechT5), so first use has far fewer
111
+ // ways to fail. English; output is mono 16kHz PCM.
112
+ const NEURAL_MODEL = 'Xenova/mms-tts-eng'
113
+
114
type RawAudio = { audio: Float32Array; sampling_rate: number }
type Synthesizer = (text: string, options?: Record<string, unknown>) => Promise<RawAudio | RawAudio[]>
// Shared pipeline load: created by the first caller, reused by every later
// one, and reset to null when the load fails so the next call can retry.
let synthPromise: Promise<Synthesizer> | null = null
// Flips to true once the shared load has resolved, so "loading" can be told
// apart from "loaded" (the promise itself stays non-null in both states).
let synthReady = false

export interface TtsModelProgress { status: string; progress?: number }

// Load (once) the neural text-to-speech pipeline and return the shared promise.
//
// `onProgress` is handed to the pipeline as its progress callback. Only the
// callback of the call that actually starts the load is wired up; callers
// that join an existing load receive no progress events.
// Rejects when the pipeline cannot be created on any backend.
function loadSynthesizer(onProgress?: (p: TtsModelProgress) => void): Promise<Synthesizer> {
  if (synthPromise) return synthPromise

  const pending = (async () => {
    const { pipeline, env } = await import('@huggingface/transformers')
    env.allowLocalModels = false
    const common = { progress_callback: onProgress as never }
    // WebGPU (Metal on macOS) is far faster than the WASM CPU backend for
    // inference. Fall back to WASM if it's unavailable or the model has an op
    // WebGPU can't run — same pattern as the Whisper path.
    const hasWebGpu = typeof navigator !== 'undefined' && 'gpu' in navigator
    if (hasWebGpu) {
      try {
        const s = (await pipeline('text-to-speech', NEURAL_MODEL, { ...common, device: 'webgpu' })) as unknown as Synthesizer
        console.info('[tts] neural backend: webgpu')
        return s
      } catch (err) {
        console.warn('[tts] WebGPU backend failed, falling back to WASM', err)
      }
    }
    const s = (await pipeline('text-to-speech', NEURAL_MODEL, common)) as unknown as Synthesizer
    console.info('[tts] neural backend: wasm')
    return s
  })()

  synthPromise = pending
  // Record the outcome. The identity check keeps a stale load from touching
  // state that belongs to a newer attempt. Handling the rejection here also
  // keeps a failed load from surfacing as an unhandled rejection.
  pending.then(
    () => { if (synthPromise === pending) synthReady = true },
    () => { if (synthPromise === pending) synthPromise = null },
  )
  return pending
}

// True only while a model load is in flight: false before the first load is
// requested, after it has completed, and after it has failed. Previously this
// stayed true forever once the model had loaded.
export function isNeuralModelLoading(): boolean {
  return synthPromise !== null && !synthReady
}
151
+
152
+ // ─── Controller ──────────────────────────────────────
153
+
154
// Serializes spoken replies: text is cleaned, chunked and queued, then played
// one chunk at a time through whichever engine is selected when the chunk is
// dequeued. stop() flushes the queue and aborts in-flight work by bumping a
// generation counter that every async step re-checks.
class SpeechController {
  private queue: string[] = []
  private draining = false
  private audioCtx: AudioContext | null = null
  private currentSource: AudioBufferSourceNode | null = null
  private generation = 0 // bumped by stop() to abort in-flight work
  private active = false
  // Receives model download/load progress, or null when there is none to show.
  onModelProgress: ((p: TtsModelProgress | null) => void) | null = null
  // Fires true while a reply is being spoken/queued, false when idle — drives
  // the visible Stop button.
  onActivityChange: ((active: boolean) => void) | null = null
  // Receives a human-readable message when speaking a chunk fails.
  onError: ((message: string) => void) | null = null

  // Prime/resume the AudioContext from a user gesture so neural playback isn't
  // blocked by the browser autoplay policy (the System engine is unaffected).
  unlock(): void {
    try {
      const ctx = this.ensureCtx()
      if (ctx.state === 'suspended') void ctx.resume()
    } catch { /* ignore */ }
  }

  // Start loading the neural model in the background (e.g. when the user picks
  // the neural engine) so the first utterance doesn't pay download/compile time.
  // Does nothing unless the neural engine is selected.
  warmup(): void {
    if (getEngine() !== 'neural') return
    void loadSynthesizer((p) => this.onModelProgress?.(p))
      .then(() => this.onModelProgress?.(null))
      .catch(() => { this.onModelProgress?.(null) })
  }

  // Clean `rawText`, split it into chunks, queue them and start draining.
  // Text that cleans down to nothing is ignored.
  enqueue(rawText: string): void {
    const text = cleanForSpeech(rawText)
    if (!text) return
    this.queue.push(...splitIntoChunks(text))
    void this.drain()
  }

  // Silence everything now: invalidate in-flight work, drop queued chunks,
  // cancel Web Speech output, stop the playing buffer, and report idle.
  stop(): void {
    this.generation++
    this.queue = []
    this.draining = false
    if (typeof speechSynthesis !== 'undefined') {
      try { speechSynthesis.cancel() } catch { /* ignore */ }
    }
    if (this.currentSource) {
      try { this.currentSource.stop() } catch { /* ignore */ }
      this.currentSource = null
    }
    this.onModelProgress?.(null)
    this.setActive(false)
  }

  // Lazily create the single AudioContext used for neural playback.
  private ensureCtx(): AudioContext {
    if (!this.audioCtx) {
      const Ctx = window.AudioContext || (window as unknown as { webkitAudioContext: typeof AudioContext }).webkitAudioContext
      this.audioCtx = new Ctx()
    }
    return this.audioCtx
  }

  // Update the activity flag, notifying the listener only on a real change.
  private setActive(a: boolean): void {
    if (this.active === a) return
    this.active = a
    this.onActivityChange?.(a)
  }

  // Play queued chunks one after another until the queue is empty or stop()
  // bumps the generation. Re-entrant calls return at once while a drain runs.
  private async drain(): Promise<void> {
    if (this.draining) return
    this.draining = true
    const gen = this.generation
    this.setActive(true)
    while (this.queue.length && gen === this.generation) {
      const chunk = this.queue.shift() as string
      try {
        if (getEngine() === 'neural') await this.speakNeural(chunk, gen)
        else await this.speakWeb(chunk, gen)
      } catch (err) {
        console.error('[tts] speak failed', err)
        this.onModelProgress?.(null)
        const detail = err instanceof Error ? err.message : String(err)
        this.onError?.(getEngine() === 'neural' ? `Neural voice failed: ${detail}` : 'Speech failed')
      }
    }
    // After a stop() the flags belong to the newer generation; leave them be.
    if (gen === this.generation) {
      this.draining = false
      this.setActive(false)
    }
  }

  // Speak one chunk through the Web Speech API. The promise resolves when the
  // utterance ends or errors, and immediately when the API is missing or the
  // generation has already moved on.
  private speakWeb(text: string, gen: number): Promise<void> {
    return new Promise((resolve) => {
      if (typeof speechSynthesis === 'undefined' || gen !== this.generation) return resolve()
      const u = new SpeechSynthesisUtterance(text)
      u.rate = getRate()
      const wantUri = getVoiceURI()
      if (wantUri) {
        // Keep the default voice when the stored one is not in the list.
        const v = speechSynthesis.getVoices().find((vv) => vv.voiceURI === wantUri)
        if (v) u.voice = v
      }
      u.onend = () => resolve()
      u.onerror = () => resolve()
      speechSynthesis.speak(u)
    })
  }

  // Speak one chunk through the neural engine: load the model, synthesize the
  // chunk, then play the PCM. Throws when synthesis returns no audio.
  private async speakNeural(text: string, gen: number): Promise<void> {
    console.info('[tts] neural: loading model…')
    this.onModelProgress?.({ status: 'loading' })
    // Forward progress only while this generation is current. A model load
    // outlives stop(), and the early return below never clears the indicator,
    // so an ungated callback would re-show progress that stop() just cleared
    // and leave it stale.
    const synth = await loadSynthesizer((p) => {
      if (gen === this.generation) this.onModelProgress?.(p)
    })
    if (gen !== this.generation) return
    console.info('[tts] neural: synthesizing', JSON.stringify(text.slice(0, 60)))
    const out = await synth(text)
    const raw = Array.isArray(out) ? out[0] : out
    this.onModelProgress?.(null)
    if (gen !== this.generation) return
    if (!raw?.audio?.length) throw new Error('neural synth returned no audio')
    console.info(`[tts] neural: playing ${raw.audio.length} samples @ ${raw.sampling_rate}Hz`)
    await this.playPcm(raw.audio, raw.sampling_rate, gen)
  }

  // Play mono PCM samples through the shared AudioContext. The promise
  // resolves when playback ends, which includes being cut short by stop().
  private async playPcm(pcm: Float32Array, sampleRate: number, gen: number): Promise<void> {
    if (gen !== this.generation) return
    const ctx = this.ensureCtx()
    // Autoplay policy can leave the context suspended; resume before playing.
    if (ctx.state === 'suspended') {
      try { await ctx.resume() } catch { /* ignore */ }
    }
    if (gen !== this.generation) return
    return new Promise((resolve) => {
      const buffer = ctx.createBuffer(1, pcm.length, sampleRate)
      buffer.getChannelData(0).set(pcm)
      const source = ctx.createBufferSource()
      source.buffer = buffer
      source.connect(ctx.destination)
      source.onended = () => {
        // stop() may already have cleared or replaced the tracked source.
        if (this.currentSource === source) this.currentSource = null
        resolve()
      }
      this.currentSource = source
      source.start()
    })
  }
}

// Shared singleton controller exported by this module.
export const speech = new SpeechController()
@@ -733,6 +733,201 @@ html, body, #root {
733
733
  margin-bottom: 8px;
734
734
  }
735
735
 
736
+ /* Mic source picker (small caret badge on the round mic + dropdown) */
737
+ .voice-source-btn {
738
+ position: absolute;
739
+ bottom: -3px;
740
+ right: -3px;
741
+ width: 18px;
742
+ height: 18px;
743
+ border-radius: 50%;
744
+ display: inline-flex;
745
+ align-items: center;
746
+ justify-content: center;
747
+ padding: 0;
748
+ font-size: 10px;
749
+ line-height: 1;
750
+ background: #2a2b3d;
751
+ border: 1px solid #3b3d57;
752
+ color: #a9b1d6;
753
+ cursor: pointer;
754
+ z-index: 60;
755
+ }
756
+ .voice-source-btn:hover {
757
+ border-color: #7aa2f7;
758
+ color: #c0caf5;
759
+ }
760
+ .voice-source-menu {
761
+ position: absolute;
762
+ bottom: 100%;
763
+ right: 0;
764
+ margin-bottom: 10px;
765
+ min-width: 200px;
766
+ max-width: 280px;
767
+ padding: 4px;
768
+ background: #16161e;
769
+ border: 1px solid #3b3d57;
770
+ border-radius: 8px;
771
+ box-shadow: 0 6px 20px rgba(0, 0, 0, 0.5);
772
+ z-index: 300;
773
+ }
774
+ .voice-source-head {
775
+ font-size: 10px;
776
+ text-transform: uppercase;
777
+ letter-spacing: 0.04em;
778
+ color: #565f89;
779
+ padding: 4px 8px 6px;
780
+ }
781
+ .voice-source-item {
782
+ display: flex;
783
+ align-items: center;
784
+ gap: 6px;
785
+ width: 100%;
786
+ padding: 5px 8px;
787
+ border: none;
788
+ border-radius: 5px;
789
+ background: transparent;
790
+ color: #a9b1d6;
791
+ font-size: 12px;
792
+ text-align: left;
793
+ cursor: pointer;
794
+ }
795
+ .voice-source-item:hover { background: #1f2335; }
796
+ .voice-source-item.active { color: #7aa2f7; }
797
+ .voice-source-check {
798
+ flex: 0 0 12px;
799
+ width: 12px;
800
+ color: #7aa2f7;
801
+ font-size: 11px;
802
+ }
803
+ .voice-source-label {
804
+ overflow: hidden;
805
+ text-overflow: ellipsis;
806
+ white-space: nowrap;
807
+ }
808
+ .voice-source-empty {
809
+ padding: 6px 8px;
810
+ font-size: 11px;
811
+ color: #565f89;
812
+ }
813
+
814
+ /* Spoken-replies titlebar control */
815
+ .speak-controls {
816
+ position: relative;
817
+ display: inline-flex;
818
+ align-items: center;
819
+ }
820
+ .speak-btn.active {
821
+ color: #7aa2f7;
822
+ border-color: #7aa2f7;
823
+ }
824
+ .speak-pct {
825
+ font-size: 9px;
826
+ margin-left: 3px;
827
+ color: #e0af68;
828
+ }
829
+ .speak-caret {
830
+ padding: 0 3px;
831
+ font-size: 10px;
832
+ min-width: 0;
833
+ }
834
+ .speak-stop {
835
+ color: #f7768e;
836
+ border-color: #f7768e;
837
+ }
838
+ .speak-stop:hover { background: #2d2030; }
839
+ .speak-error {
840
+ position: absolute;
841
+ top: 100%;
842
+ right: 0;
843
+ margin-top: 6px;
844
+ max-width: 340px;
845
+ padding: 4px 9px;
846
+ border-radius: 5px;
847
+ font-size: 11px;
848
+ line-height: 1.3;
849
+ white-space: normal;
850
+ word-break: break-word;
851
+ background: #2d2030;
852
+ color: #f7768e;
853
+ border: 1px solid #f7768e;
854
+ z-index: 60;
855
+ }
856
+ .speak-menu {
857
+ position: absolute;
858
+ top: 100%;
859
+ right: 0;
860
+ margin-top: 6px;
861
+ min-width: 220px;
862
+ max-width: 280px;
863
+ padding: 6px;
864
+ background: #16161e;
865
+ border: 1px solid #3b3d57;
866
+ border-radius: 8px;
867
+ box-shadow: 0 6px 20px rgba(0, 0, 0, 0.5);
868
+ z-index: 300;
869
+ }
870
+ .speak-menu-head {
871
+ font-size: 10px;
872
+ text-transform: uppercase;
873
+ letter-spacing: 0.04em;
874
+ color: #565f89;
875
+ padding: 6px 6px 4px;
876
+ }
877
+ .speak-menu-item {
878
+ display: flex;
879
+ align-items: center;
880
+ gap: 6px;
881
+ width: 100%;
882
+ padding: 5px 6px;
883
+ border: none;
884
+ border-radius: 5px;
885
+ background: transparent;
886
+ color: #a9b1d6;
887
+ font-size: 12px;
888
+ text-align: left;
889
+ cursor: pointer;
890
+ }
891
+ .speak-menu-item:hover { background: #1f2335; }
892
+ .speak-menu-item.active { color: #7aa2f7; }
893
+ .speak-menu-check {
894
+ flex: 0 0 12px;
895
+ width: 12px;
896
+ color: #7aa2f7;
897
+ font-size: 11px;
898
+ }
899
+ .speak-select {
900
+ width: 100%;
901
+ margin: 2px 0 4px;
902
+ padding: 4px 6px;
903
+ background: #1f2335;
904
+ color: #a9b1d6;
905
+ border: 1px solid #3b3d57;
906
+ border-radius: 5px;
907
+ font-size: 12px;
908
+ }
909
+ .speak-rate {
910
+ width: 100%;
911
+ margin: 2px 0 6px;
912
+ accent-color: #7aa2f7;
913
+ }
914
+ .speak-menu-row {
915
+ display: flex;
916
+ gap: 6px;
917
+ padding: 2px 0 0;
918
+ }
919
+ .speak-menu-btn {
920
+ flex: 1;
921
+ padding: 5px 6px;
922
+ background: #2a2b3d;
923
+ color: #a9b1d6;
924
+ border: 1px solid #3b3d57;
925
+ border-radius: 5px;
926
+ font-size: 12px;
927
+ cursor: pointer;
928
+ }
929
+ .speak-menu-btn:hover { border-color: #7aa2f7; color: #c0caf5; }
930
+
736
931
  /* Editor panel */
737
932
  .editor-panel {
738
933
  display: flex;