speech-to-speech 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,1118 @@
1
+ import { getDefaultRealTimeVADOptions, MicVAD } from '@ricky0123/vad-web';
2
+ import * as piperTts from '@realtimex/piper-tts-web';
3
+
4
// src/stt/reset-stt-logic.ts
/**
 * Tracks speech-activity timing for one utterance and decides when transcript
 * state should be reset: either the speaker went silent for maxSilenceMs, or
 * the utterance ran past maxUtteranceMs. The clock is injectable via
 * options.now so the timing logic is deterministic under test.
 */
var ResetSTTLogic = class {
  constructor(options = {}) {
    this.partialTranscript = "";
    this.maxSilenceMs = options.maxSilenceMs ?? 2e3;
    this.maxUtteranceMs = options.maxUtteranceMs ?? 15e3;
    this.onReset = options.onReset;
    this.now = options.now ?? (() => Date.now());
    // Both timers begin at construction time.
    const startedAt = this.now();
    this.utteranceStartedAt = startedAt;
    this.lastActivityAt = startedAt;
  }
  // Mark that speech was detected at `timestamp` (defaults to the clock).
  recordSpeechActivity(timestamp) {
    const at = timestamp ?? this.now();
    this.lastActivityAt = at;
    // A falsy start (e.g. a clock that began at 0) counts as "not started yet".
    if (!this.utteranceStartedAt) {
      this.utteranceStartedAt = at;
    }
  }
  // Store the latest partial transcript and count it as speech activity.
  updatePartialTranscript(partial, timestamp) {
    this.partialTranscript = partial;
    this.recordSpeechActivity(timestamp);
  }
  // Returns "silence", "utterance-complete", or null when no reset is due.
  // Silence is checked first, so it wins when both windows have elapsed.
  shouldReset(timestamp) {
    const at = timestamp ?? this.now();
    if (at - this.lastActivityAt >= this.maxSilenceMs) return "silence";
    if (at - this.utteranceStartedAt >= this.maxUtteranceMs) return "utterance-complete";
    return null;
  }
  // Reset only if a reset condition currently holds; returns the reason or null.
  maybeReset(timestamp) {
    const reason = this.shouldReset(timestamp);
    if (reason) this.reset(reason, timestamp);
    return reason;
  }
  // Unconditional reset, defaulting the reported reason to "manual".
  forceReset(reason = "manual", timestamp) {
    this.reset(reason, timestamp);
  }
  // Clear timing + transcript state, then notify onReset with a snapshot of
  // the state as it was immediately before the reset.
  reset(reason, timestamp) {
    const at = timestamp ?? this.now();
    const snapshot = {
      utteranceStartedAt: this.utteranceStartedAt,
      lastActivityAt: this.lastActivityAt,
      partialTranscript: this.partialTranscript
    };
    this.utteranceStartedAt = at;
    this.lastActivityAt = at;
    this.partialTranscript = "";
    this.onReset?.(reason, snapshot);
  }
};
64
/**
 * Thin lifecycle wrapper around @ricky0123/vad-web's MicVAD.
 * Lazily creates the detector on first start(), fans speech start/stop events
 * out to registered listener sets, and exposes stop()/destroy() teardown.
 */
var VADController = class {
  constructor(options) {
    this.vad = null;                                      // underlying MicVAD instance (created lazily)
    this.voiceStartListeners = /* @__PURE__ */ new Set(); // callbacks fired on speech onset
    this.voiceStopListeners = /* @__PURE__ */ new Set();  // callbacks fired on speech end
    this.running = false;                                 // true between a successful start() and stop()
    this.options = options;                               // presumably { minSpeechMs?, minSilenceMs? } — TODO confirm against callers
  }
  /**
   * Start (or resume) voice activity detection.
   * Creates the MicVAD on first call; subsequent calls just resume listening.
   * @throws when microphone APIs are unavailable or MicVAD init fails.
   */
  async start() {
    // Already started: just make sure the detector is actually listening.
    if (this.running && this.vad) {
      if (!this.vad.listening) {
        await this.vad.start();
      }
      return;
    }
    if (typeof navigator === "undefined" || !navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      throw new Error("Microphone access is not available.");
    }
    try {
      // If onnxruntime-web is exposed globally, point it at locally hosted wasm.
      const ortAny = window.ort;
      if (ortAny && ortAny.env && ortAny.env.wasm) {
        ortAny.env.wasm.wasmPaths = "/ort/";
      }
      if (!this.vad) {
        const defaultOptions = getDefaultRealTimeVADOptions("v5");
        this.vad = await MicVAD.new({
          ...defaultOptions,
          startOnLoad: false,
          onSpeechStart: () => {
            this.emitVoiceStart();
          },
          onSpeechEnd: (audio) => {
            // Captured audio is intentionally discarded; only the event matters here.
            this.emitVoiceStop();
          },
          onVADMisfire: () => {
          },
          minSpeechMs: this.options?.minSpeechMs || 150,
          positiveSpeechThreshold: 0.5,
          negativeSpeechThreshold: 0.35,
          redemptionMs: this.options?.minSilenceMs || 450,
          preSpeechPadMs: 50,
          processorType: "ScriptProcessor",
          onnxWASMBasePath: "/ort/",
          baseAssetPath: "/vad/",
          workletOptions: {}
        });
      }
      if (!this.vad.listening) {
        await this.vad.start();
      }
      this.running = true;
    } catch (error) {
      this.running = false;
      throw new Error(
        error?.message || "Failed to initialize voice activity detector"
      );
    }
  }
  // Pause detection; the MicVAD instance is kept for a cheap restart.
  stop() {
    if (!this.running || !this.vad) return;
    try {
      this.vad.pause();
      this.running = false;
    } catch (error) {
      // Best-effort pause: errors are deliberately swallowed.
    }
  }
  // Full teardown: stop, release the MicVAD, and drop all listeners.
  destroy() {
    this.stop();
    if (this.vad) {
      try {
        this.vad.destroy();
      } catch (error) {
        // Best-effort destroy: errors are deliberately swallowed.
      }
      this.vad = null;
    }
    this.voiceStartListeners.clear();
    this.voiceStopListeners.clear();
  }
  // True only while started AND the underlying detector reports listening.
  isActive() {
    return this.running && this.vad !== null && this.vad.listening;
  }
  // Subscribe to speech onset; returns an unsubscribe function.
  onVoiceStart(listener) {
    this.voiceStartListeners.add(listener);
    return () => this.voiceStartListeners.delete(listener);
  }
  // Subscribe to speech end; returns an unsubscribe function.
  onVoiceStop(listener) {
    this.voiceStopListeners.add(listener);
    return () => this.voiceStopListeners.delete(listener);
  }
  // Notify all start listeners; one throwing listener cannot break the rest.
  emitVoiceStart() {
    this.voiceStartListeners.forEach((listener) => {
      try {
        listener();
      } catch (error) {
        console.error("Error in voice start listener:", error);
      }
    });
  }
  // Notify all stop listeners; one throwing listener cannot break the rest.
  emitVoiceStop() {
    this.voiceStopListeners.forEach((listener) => {
      try {
        listener();
      } catch (error) {
        console.error("Error in voice stop listener:", error);
      }
    });
  }
};
172
+
173
// src/stt/stt-logic.ts
/**
 * Browser SpeechRecognition wrapper with periodic auto-restart.
 *
 * Chrome's SpeechRecognition degrades on long sessions, so this class
 * force-restarts recognition after `sessionDurationMs` of accumulated mic
 * time, stitching transcripts across restarts (`transcriptBeforeRestart` +
 * current session) and recording per-restart timing metrics.
 *
 * @param onLog       (message, level) sink for diagnostics ("info" | "warning" | "error").
 * @param onTranscript called with the full stitched transcript on each save.
 * @param options     { sessionDurationMs?, interimSaveIntervalMs?, preserveTranscriptOnStart? }
 * @throws when the SpeechRecognition API is unavailable.
 */
var ResetSTTLogic2 = class {
  constructor(onLog, onTranscript, options = {}) {
    this.isListening = false;              // user-level listening flag (start()..stop())
    this.fullTranscript = "";              // finalized transcript of the current session
    this.heardWords = [];                  // fullTranscript split into words
    this.onWordsUpdate = null;             // optional word-list observer
    this.onMicTimeUpdate = null;           // optional mic-time observer
    this.onRestartMetrics = null;          // optional restart-timing observer
    this.micOnTime = 0;                    // accumulated mic-on milliseconds
    this.sessionDuration = 3e4;            // ms of mic time before an auto-restart
    this.lastTickTime = 0;                 // last mic-timer tick wall clock
    this.micTimeInterval = null;           // setInterval handle for the mic timer
    this.restartCount = 0;                 // number of auto-restarts this run
    this.isRestarting = false;             // true while performRestart() is in flight
    this.isRecognitionRunning = false;     // tracks whether recognition.start() succeeded
    this.lastInterimTranscript = "";       // latest interim hypothesis
    this.lastInterimSaveTime = 0;          // last periodic interim-save wall clock
    this.interimSaveInterval = 1e3;        // ms between interim saves (overwritten below)
    this.lastInterimResultTime = 0;        // wall clock of the last interim result
    this.lastSavedLength = 0;              // length of fullTranscript at last save
    this.transcriptBeforeRestart = "";     // transcript carried across auto-restarts
    this.sessionStartTranscript = "";      // transcript baseline for the current session
    this.sessionId = 0;                    // monotonically increasing recognition session id
    this.awaitingRestartFirstResultId = null; // restart id still waiting for its first result
    this.lastWasFinal = false;             // whether the previous result event was final
    this.restartMetrics = {};              // per-restart-id timing records
    this.isAutoRestarting = false;         // exposed via isInAutoRestart()
    this.onLog = onLog;
    this.onTranscript = onTranscript;
    this.options = {
      sessionDurationMs: options.sessionDurationMs ?? 3e4,
      interimSaveIntervalMs: options.interimSaveIntervalMs ?? 5e3,
      preserveTranscriptOnStart: options.preserveTranscriptOnStart ?? false
    };
    this.sessionDuration = this.options.sessionDurationMs;
    this.interimSaveInterval = this.options.interimSaveIntervalMs;
    // webkit prefix is still required in Chromium-based browsers.
    const SpeechRecognitionAPI = window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!SpeechRecognitionAPI) {
      this.onLog("Speech Recognition API not supported", "error");
      throw new Error("Speech Recognition API not available");
    }
    this.recognition = new SpeechRecognitionAPI();
    this.setupRecognition();
  }
  // Register an observer for the evolving word list.
  setWordsUpdateCallback(callback) {
    this.onWordsUpdate = callback;
  }
  // Register an observer for accumulated mic-on time (ms).
  setMicTimeUpdateCallback(callback) {
    this.onMicTimeUpdate = callback;
  }
  // Register an observer for (restartCount, restartDurationMs).
  setRestartMetricsCallback(callback) {
    this.onRestartMetrics = callback;
  }
  // Wire optional VAD-style callbacks; only onUserSpeechStart is invoked here
  // (onUserSpeechEnd is stored but not called in this class).
  setVadCallbacks(onSpeechStart, onSpeechEnd) {
    this.onUserSpeechStart = onSpeechStart || void 0;
    this.onUserSpeechEnd = onSpeechEnd || void 0;
  }
  getSessionDurationMs() {
    return this.sessionDuration;
  }
  isInAutoRestart() {
    return this.isAutoRestarting;
  }
  // Full transcript = text buffered before the last auto-restart plus the
  // current session's finalized text.
  getFullTranscript() {
    if (this.transcriptBeforeRestart.length > 0) {
      if (this.fullTranscript.length > 0) {
        return (this.transcriptBeforeRestart + " " + this.fullTranscript).trim();
      }
      return this.transcriptBeforeRestart;
    }
    return this.fullTranscript;
  }
  // Drop all accumulated transcript state (does not touch interim buffers).
  clearTranscript() {
    this.fullTranscript = "";
    this.transcriptBeforeRestart = "";
    this.sessionStartTranscript = "";
    this.heardWords = [];
  }
  // Configure the recognizer and attach result/error/end/start handlers.
  // Handlers are kept as fields so destroy() can detach them.
  setupRecognition() {
    this.recognition.lang = "en-US";
    this.recognition.interimResults = true;
    this.recognition.continuous = true;
    this.recognition.maxAlternatives = 1;
    this.resultHandler = (event) => {
      const speechEvent = event;
      // Rebuild the whole hypothesis from every result in the event, since
      // continuous mode accumulates results per session.
      let completeTranscript = "";
      for (let i = 0; i < speechEvent.results.length; i++) {
        completeTranscript += speechEvent.results[i][0].transcript + " ";
      }
      completeTranscript = completeTranscript.trim();
      const isFinal = speechEvent.results[speechEvent.results.length - 1].isFinal;
      completeTranscript = this.collapseRepeats(completeTranscript);
      this.lastInterimTranscript = completeTranscript;
      this.lastInterimResultTime = Date.now();
      // First result seen after a pending restart: record latency once.
      if (this.awaitingRestartFirstResultId != null) {
        const rid = this.awaitingRestartFirstResultId;
        if (this.restartMetrics[rid] && !this.restartMetrics[rid].firstResultAt) {
          this.restartMetrics[rid].firstResultAt = Date.now();
          const delta = this.restartMetrics[rid].firstResultAt - this.restartMetrics[rid].requestedAt;
          this.onLog(
            `\u{1F514} First result after restart #${rid} in ${delta}ms`,
            "info"
          );
          this.awaitingRestartFirstResultId = null;
        }
      }
      this.onLog(
        `[${isFinal ? "FINAL" : "INTERIM"}] "${completeTranscript}"`,
        isFinal ? "info" : "warning"
      );
      // Interim right after a final = user started speaking again.
      if (!isFinal && this.lastWasFinal) {
        try {
          this.onUserSpeechStart?.();
        } catch {
        }
      }
      this.lastWasFinal = isFinal;
      if (isFinal) {
        // Fold the finalized hypothesis onto the session baseline.
        this.fullTranscript = (this.sessionStartTranscript + " " + completeTranscript).trim();
        this.fullTranscript = this.collapseRepeats(this.fullTranscript);
        this.heardWords = this.fullTranscript.split(/\s+/).filter((word) => word.length > 0);
        this.onTranscript(this.getFullTranscript());
        this.lastSavedLength = this.fullTranscript.length;
        if (this.onWordsUpdate) this.onWordsUpdate(this.heardWords);
        this.lastInterimTranscript = "";
        if (this.awaitingRestartFirstResultId != null) {
          const rid = this.awaitingRestartFirstResultId;
          if (this.restartMetrics[rid] && !this.restartMetrics[rid].firstResultAt) {
            this.restartMetrics[rid].firstResultAt = Date.now();
            // NOTE(review): the next line is a no-op expression statement —
            // its value is computed and discarded; looks like a leftover.
            this.restartMetrics[rid].startedAt || this.restartMetrics[rid].startAttemptAt || Date.now();
            const firstResultDelta = this.restartMetrics[rid].firstResultAt - this.restartMetrics[rid].requestedAt;
            this.onLog(
              `\u{1F514} First result after restart #${rid} in ${firstResultDelta}ms`,
              "info"
            );
            this.awaitingRestartFirstResultId = null;
          }
        }
      }
    };
    this.recognition.addEventListener("result", this.resultHandler);
    this.errorHandler = (event) => {
      const errorEvent = event;
      // "aborted" is expected while we tear down during performRestart().
      if (errorEvent.error === "aborted" && this.isRestarting) {
        this.onLog("Aborted during restart (ignored)", "info");
        this.isRecognitionRunning = false;
        return;
      }
      this.onLog(`Error: ${errorEvent.error}`, "error");
      // Transient errors: retry after a short delay if still listening.
      if (errorEvent.error === "no-speech" || errorEvent.error === "audio-capture" || errorEvent.error === "network") {
        setTimeout(() => {
          if (this.isListening && !this.isRestarting && !this.isRecognitionRunning) {
            try {
              this.recognition.start();
              this.isRecognitionRunning = true;
              this.sessionId++;
            } catch (e) {
              this.onLog(`Failed restart after error: ${e}`, "error");
            }
          }
        }, 500);
      } else {
        this.onLog(
          `Unhandled SpeechRecognition error: ${errorEvent.error}`,
          "warning"
        );
      }
    };
    this.recognition.addEventListener("error", this.errorHandler);
    this.endHandler = () => {
      this.isRecognitionRunning = false;
      // Recognition ended on its own: auto-resume unless we are mid-restart.
      if (this.isListening && !this.isRestarting) {
        setTimeout(() => {
          if (this.isListening && !this.isRestarting) {
            try {
              this.recognition.start();
              this.isRecognitionRunning = true;
              this.sessionId++;
              this.onLog(
                `\u{1F501} Auto-resumed recognition after end (session ${this.sessionId})`,
                "info"
              );
            } catch (e) {
              this.onLog(`Failed to auto-start after end: ${e}`, "error");
            }
          }
        }, 100);
      }
    };
    this.recognition.addEventListener("end", this.endHandler);
    this.startHandler = () => {
      this.isRecognitionRunning = true;
      // Record how long a pending restart took to reach "start".
      const rid = this.awaitingRestartFirstResultId;
      if (rid != null && this.restartMetrics[rid]) {
        if (!this.restartMetrics[rid].startedAt) {
          this.restartMetrics[rid].startedAt = Date.now();
          this.onLog(
            `\u25B6\uFE0F Restart #${rid} recognition started in ${this.restartMetrics[rid].startedAt - this.restartMetrics[rid].requestedAt}ms`,
            "info"
          );
        }
      }
    };
    this.recognition.addEventListener("start", this.startHandler);
  }
  // Resolve with the next `eventName` event on the recognizer, or null after
  // timeoutMs. Never rejects; always detaches the one-shot handler.
  waitForEventOnce(eventName, timeoutMs) {
    return new Promise((resolve) => {
      let timer = null;
      const handler = (ev) => {
        if (timer !== null) clearTimeout(timer);
        this.recognition.removeEventListener(eventName, handler);
        resolve(ev);
      };
      this.recognition.addEventListener(eventName, handler);
      timer = window.setTimeout(() => {
        this.recognition.removeEventListener(eventName, handler);
        resolve(null);
      }, timeoutMs);
    });
  }
  // 100ms heartbeat: accumulates mic time, periodically flushes interim text,
  // and triggers the auto-restart once sessionDuration of mic time is reached.
  startMicTimer() {
    this.lastTickTime = Date.now();
    this.lastInterimSaveTime = Date.now();
    this.micTimeInterval = window.setInterval(() => {
      if (this.isListening) {
        const now = Date.now();
        const elapsed = now - this.lastTickTime;
        this.micOnTime += elapsed;
        this.lastTickTime = now;
        if (now - this.lastInterimSaveTime >= this.interimSaveInterval) {
          this.saveInterimToFinal();
          this.lastInterimSaveTime = now;
        }
        if (this.micOnTime >= this.sessionDuration) {
          if (!this.isRestarting) this.performRestart();
        }
        if (this.onMicTimeUpdate) this.onMicTimeUpdate(this.micOnTime);
      }
    }, 100);
  }
  // Stop the heartbeat started by startMicTimer().
  stopMicTimer() {
    if (this.micTimeInterval) {
      clearInterval(this.micTimeInterval);
      this.micTimeInterval = null;
    }
  }
  // Promote a stale interim hypothesis into the finalized transcript — only
  // when no newer interim has arrived within interimSaveInterval and the
  // interim is longer than what was last saved.
  saveInterimToFinal() {
    if (!this.lastInterimTranscript) return;
    const now = Date.now();
    if (now - this.lastInterimResultTime > this.interimSaveInterval && this.lastInterimTranscript.length > this.lastSavedLength) {
      this.fullTranscript = (this.fullTranscript + " " + this.lastInterimTranscript).trim();
      this.fullTranscript = this.collapseRepeats(this.fullTranscript);
      this.lastSavedLength = this.fullTranscript.length;
      if (this.onWordsUpdate) {
        const words = this.fullTranscript.split(/\s+/).filter((w) => w.length > 0);
        this.onWordsUpdate(words);
      }
      this.onTranscript(this.getFullTranscript());
    }
  }
  // Return the part of `current` that is not already a suffix of `base`, by
  // finding the longest overlap between base's tail and current's head.
  // Used to de-duplicate transcripts that repeat across restarts.
  getSuffixToAppend(base, current) {
    if (!base || base.length === 0) return current;
    if (!current || current.length === 0) return "";
    base = base.trim();
    current = current.trim();
    if (current.startsWith(base)) {
      return current.slice(base.length).trim();
    }
    const maxOverlap = Math.min(base.length, current.length);
    for (let overlap = maxOverlap; overlap > 0; overlap--) {
      if (base.endsWith(current.slice(0, overlap))) {
        return current.slice(overlap).trim();
      }
    }
    return current;
  }
  // Remove repetition artifacts the recognizer produces:
  // 1) whole-string periodicity via a KMP failure-function check,
  // 2) adjacent repeated word blocks (up to 20 words long),
  // 3) adjacent duplicate single words.
  // NOTE(review): `!text` short-circuits to `text.trim()`, which would throw
  // for null/undefined input — callers only ever pass strings; confirm.
  collapseRepeats(text) {
    if (!text || text.trim().length === 0) return text.trim();
    let normalized = text.replace(/\s+/g, " ").trim();
    const n = normalized.length;
    // Build the KMP longest-proper-prefix-suffix table over characters.
    const lps = new Array(n).fill(0);
    for (let i = 1; i < n; i++) {
      let j = lps[i - 1];
      while (j > 0 && normalized[i] !== normalized[j]) j = lps[j - 1];
      if (normalized[i] === normalized[j]) j++;
      lps[i] = j;
    }
    // If the whole string is k copies of one period, keep a single copy.
    const period = n - lps[n - 1];
    if (period < n && n % period === 0) {
      return normalized.slice(0, period).trim();
    }
    // Collapse adjacent repeated blocks of words, largest blocks first.
    const words = normalized.split(" ");
    for (let block = Math.min(20, Math.floor(words.length / 2)); block >= 1; block--) {
      let i = 0;
      while (i + 2 * block <= words.length) {
        let blockA = words.slice(i, i + block).join(" ");
        let blockB = words.slice(i + block, i + 2 * block).join(" ");
        if (blockA === blockB) {
          words.splice(i + block, block);
        } else {
          i++;
        }
      }
    }
    // Finally drop immediately repeated single words.
    const collapsedWords = [];
    for (const w of words) {
      if (collapsedWords.length === 0 || collapsedWords[collapsedWords.length - 1] !== w)
        collapsedWords.push(w);
    }
    return collapsedWords.join(" ").trim();
  }
  // Stop-and-restart the recognizer to work around long-session degradation.
  // Buffers the transcript into transcriptBeforeRestart, waits (with timeouts)
  // for end/start/result events, records metrics, then resumes the mic timer.
  performRestart() {
    if (!this.isListening || this.isRestarting) return;
    const restartStartTime = Date.now();
    this.restartCount++;
    this.isRestarting = true;
    this.isAutoRestarting = true;
    const rid = ++this.sessionId;
    this.awaitingRestartFirstResultId = rid;
    this.restartMetrics[rid] = { requestedAt: restartStartTime };
    this.onLog(
      `\u{1F504} [AUTO-RESTART] Session ${rid} - buffering transcript, waiting for silence...`,
      "warning"
    );
    // Flush any pending interim text before snapshotting the transcript.
    if (this.lastInterimTranscript.trim().length > 0) {
      this.saveInterimToFinal();
    }
    this.transcriptBeforeRestart = this.getFullTranscript();
    this.fullTranscript = "";
    this.sessionStartTranscript = "";
    this.lastInterimTranscript = "";
    this.heardWords = [];
    this.stopMicTimer();
    const stopTimeout = 600;
    const startTimeout = 1e3;
    const firstResultTimeout = 2e3;
    // Stop politely; escalate to abort() if no "end" event arrives in time.
    const stopNow = async () => {
      try {
        if (this.isRecognitionRunning) {
          this.recognition.stop();
        } else {
          this.onLog("Recognition not running at stop attempt", "warning");
        }
      } catch (err) {
        this.onLog(`Stop threw: ${err}`, "warning");
      }
      const endEvent = await this.waitForEventOnce("end", stopTimeout);
      if (!endEvent) {
        try {
          this.recognition.abort();
        } catch (err) {
          this.onLog(`Abort also failed: ${err}`, "error");
        }
        await this.waitForEventOnce("end", 300);
      }
      this.restartMetrics[rid].stopAt = Date.now();
    };
    // Fire-and-forget: the restart sequence runs asynchronously while
    // performRestart() itself returns immediately.
    (async () => {
      await stopNow();
      this.restartMetrics[rid].startAttemptAt = Date.now();
      try {
        if (!this.isRecognitionRunning) {
          this.sessionId = rid;
          this.recognition.start();
        } else {
          this.onLog(
            "Recognition already running at restart time; skipping start.",
            "warning"
          );
        }
      } catch (e) {
        this.onLog(`Failed to start recognition after restart: ${e}`, "error");
      }
      const startEv = await this.waitForEventOnce("start", startTimeout);
      if (startEv) {
        this.restartMetrics[rid].startedAt = Date.now();
      } else {
        this.onLog(
          `Restart #${rid} did not produce start event within ${startTimeout}ms`,
          "warning"
        );
      }
      const resEv = await this.waitForEventOnce("result", firstResultTimeout);
      if (resEv) {
        if (this.restartMetrics[rid])
          this.restartMetrics[rid].firstResultAt = Date.now();
        const firstResultDelta = (this.restartMetrics[rid].firstResultAt || Date.now()) - (this.restartMetrics[rid].requestedAt || Date.now());
        this.onLog(
          `\u{1F514} First result after restart #${rid} in ${firstResultDelta}ms`,
          "info"
        );
      } else {
        this.onLog(
          `Restart #${rid} produced no result within ${firstResultTimeout}ms`,
          "warning"
        );
      }
      const startedAt = this.restartMetrics[rid].startedAt || this.restartMetrics[rid].startAttemptAt || Date.now();
      const restartDuration = startedAt - this.restartMetrics[rid].requestedAt;
      if (this.onRestartMetrics)
        this.onRestartMetrics(this.restartCount, restartDuration);
      this.onLog(
        `\u2705 Session ${rid} restarted in ${restartDuration}ms - resuming from silence gate`,
        "info"
      );
      this.startMicTimer();
      this.isRestarting = false;
      this.isAutoRestarting = false;
    })();
  }
  // Begin listening. Clears transcript state unless preserveTranscriptOnStart
  // was configured, then starts recognition and the mic timer.
  start() {
    if (this.isListening) return;
    try {
      this.isListening = true;
      if (!this.options.preserveTranscriptOnStart) {
        this.fullTranscript = "";
        this.heardWords = [];
        this.transcriptBeforeRestart = "";
        this.sessionStartTranscript = "";
      } else {
        // Keep the existing transcript as the baseline for this session.
        this.sessionStartTranscript = this.fullTranscript;
      }
      this.micOnTime = 0;
      this.restartCount = 0;
      this.lastSavedLength = 0;
      this.lastInterimTranscript = "";
      this.lastWasFinal = false;
      if (!this.isRecognitionRunning) {
        this.sessionId++;
        this.recognition.start();
        this.isRecognitionRunning = true;
      }
      this.startMicTimer();
      this.onLog(
        "Listening started (auto-restart every 30s of mic time)",
        "info"
      );
    } catch (error) {
      this.isListening = false;
      this.onLog(`Failed to start: ${error}`, "error");
    }
  }
  // Stop listening and the mic timer; transcript state is preserved.
  stop() {
    if (!this.isListening) return;
    try {
      this.isListening = false;
      this.isAutoRestarting = false;
      this.stopMicTimer();
      this.recognition.stop();
      this.isRecognitionRunning = false;
      this.onLog(
        `Stopped listening (total mic time: ${(this.micOnTime / 1e3).toFixed(1)}s, restarts: ${this.restartCount})`,
        "info"
      );
    } catch (error) {
      this.onLog(`Failed to stop: ${error}`, "error");
    }
  }
  // Tear down: abort recognition and detach all handlers attached in
  // setupRecognition(). Safe to call more than once.
  destroy() {
    this.isListening = false;
    this.stopMicTimer();
    try {
      this.recognition.abort?.();
    } catch (e) {
      // Best-effort abort.
    }
    try {
      if (this.resultHandler)
        this.recognition.removeEventListener("result", this.resultHandler);
      if (this.errorHandler)
        this.recognition.removeEventListener("error", this.errorHandler);
      if (this.endHandler)
        this.recognition.removeEventListener("end", this.endHandler);
      if (this.startHandler)
        this.recognition.removeEventListener("start", this.startHandler);
    } catch (e) {
      // Best-effort detach.
    }
  }
};
660
// Public alias: STTLogic inherits all behavior from ResetSTTLogic2 unchanged.
var STTLogic = class extends ResetSTTLogic2 {
};
662
+
663
// src/tts/prepare-piper-voice.ts
/**
 * Normalize a partial Piper voice config into a fully-populated voice record.
 * Defaults: modelPath "voices/<voiceId>.onnx", sampleRate 22050,
 * lengthScale 1, noiseScale 0.667, speaker "default".
 */
function preparePiperVoice(config) {
  const {
    voiceId,
    modelPath = `voices/${config.voiceId}.onnx`,
    sampleRate = 22050,
    lengthScale = 1,
    noiseScale = 0.667,
    speaker = "default"
  } = config;
  return {
    voiceId,
    modelPath: config.modelPath ?? modelPath,
    sampleRate: config.sampleRate ?? sampleRate,
    inference: {
      lengthScale: config.lengthScale ?? lengthScale,
      noiseScale: config.noiseScale ?? noiseScale
    },
    metadata: {
      speaker: config.speaker ?? speaker
    }
  };
}
679
+
680
// src/tts/stream-tokens-to-speech.ts
/**
 * True when `value` implements the async-iteration protocol.
 * Fix: guard null/undefined first — indexing them with Symbol.asyncIterator
 * used to throw a TypeError instead of returning false.
 */
function isAsyncIterable(value) {
  return value != null && typeof value[Symbol.asyncIterator] === "function";
}
684
+ var sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
685
/**
 * Accumulate string tokens (from a sync or async iterable) into chunks of at
 * least `chunkSize` characters, hand each chunk to `options.onChunk`, and
 * optionally pause `delayMs` between chunks.
 * @returns {Promise<{chunksEmitted: number, characters: number}>} counters.
 */
async function streamTokensToSpeech(tokens, options = {}) {
  const chunkSize = options.chunkSize ?? 40;
  const delayMs = options.delayMs ?? 0;
  let pending = "";
  let chunksEmitted = 0;
  let characters = 0;
  // Flush the pending buffer to the consumer, then apply the optional pacing delay.
  const flush = async () => {
    if (!pending) return;
    characters += pending.length;
    chunksEmitted += 1;
    if (options.onChunk) {
      await options.onChunk(pending);
    }
    pending = "";
    if (delayMs > 0) {
      await new Promise((resume) => setTimeout(resume, delayMs));
    }
  };
  // Append one token and flush once the chunk threshold is reached.
  const feed = async (token) => {
    pending += token;
    if (pending.length >= chunkSize) {
      await flush();
    }
  };
  if (tokens != null && typeof tokens[Symbol.asyncIterator] === "function") {
    for await (const token of tokens) {
      await feed(token);
    }
  } else {
    for (const token of tokens) {
      await feed(token);
    }
  }
  // Emit whatever is left below the threshold.
  await flush();
  return { chunksEmitted, characters };
}
723
+
724
// src/tts/ort-setup.ts
/**
 * Build and initialize a lightweight ORT environment descriptor.
 * Provider list defaults to ["webgpu","wasm"] for the webgpu device and
 * ["wasm"] otherwise; device defaults to "cpu", logLevel to "warning".
 * The returned object's init() has already been awaited (initialized: true).
 */
async function createOrtEnvironment(config = {}) {
  let providers = config.providers;
  if (!providers) {
    providers = config.device === "webgpu" ? ["webgpu", "wasm"] : ["wasm"];
  }
  const environment = {
    device: config.device ?? "cpu",
    logLevel: config.logLevel ?? "warning",
    providers,
    initialized: false,
    // Init is currently just a readiness flag flip.
    async init() {
      this.initialized = true;
    }
  };
  await environment.init();
  return environment;
}
739
+
740
// src/tts/piper.ts
// Module-level caches: prepared voice records keyed by voiceId, and the
// lazily-created ORT environment singleton (see ensureOrtReady).
var voiceCache = /* @__PURE__ */ new Map();
var ortEnv = null;
743
/**
 * Lazily create the shared ORT environment; subsequent calls return the
 * cached instance and ignore `config`.
 */
async function ensureOrtReady(config = {}) {
  if (!ortEnv) {
    ortEnv = await createOrtEnvironment(config);
  }
  return ortEnv;
}
748
/**
 * Return the cached prepared voice for config.voiceId, preparing and caching
 * it on first use. The first config seen for a voiceId wins.
 */
async function ensureVoiceLoaded(config) {
  const existing = voiceCache.get(config.voiceId);
  if (existing) {
    return existing;
  }
  const prepared = preparePiperVoice(config);
  voiceCache.set(config.voiceId, prepared);
  return prepared;
}
755
/**
 * Prime the synthesis pipeline: load the voice, then run one synthesis pass
 * over `text` (result discarded) so later calls start warm.
 */
async function warmupPiper(voiceConfig, synth, text = "warmup") {
  const loadedVoice = await ensureVoiceLoaded(voiceConfig);
  await synth(text, loadedVoice);
}
759
// Drop every cached voice so the next load re-prepares it from its config.
function resetVoiceCache() {
  voiceCache.clear();
}
762
/**
 * Human-readable backend label: "auto" when no device is given,
 * "WebGPU" for "webgpu", "CPU" for everything else.
 */
function getBackendLabel(device) {
  if (!device) return "auto";
  if (device === "webgpu") return "WebGPU";
  return "CPU";
}
766
/**
 * Heuristic check for a corrupted-model failure: accepts an Error-like object
 * or a plain string and looks for corrupt/checksum/integrity (case-insensitive)
 * in the message. Falsy or message-less inputs return false.
 */
function isCorruptModelError(error) {
  if (!error) return false;
  const message = typeof error === "string" ? error : error.message;
  if (!message) return false;
  return /corrupt|checksum|integrity/i.test(message);
}
772
/**
 * Pipeline stage: loads the voice once, then yields one synthesis result per
 * incoming text item. Note the synth promise itself is yielded un-awaited,
 * matching the downstream consumer's expectations.
 */
async function* synthesizerWorker(textQueue, voiceConfig, synth) {
  const loadedVoice = await ensureVoiceLoaded(voiceConfig);
  for await (const phrase of textQueue) {
    yield synth(phrase, loadedVoice);
  }
}
778
/**
 * Pipeline stage: drain the audio queue, playing each item to completion
 * before requesting the next.
 */
async function playerWorker(audioQueue, play) {
  for await (const clip of audioQueue) {
    await play(clip);
  }
}
783
/**
 * Index of the first sentence/clause boundary (. ! ? ,) in `text`,
 * or -1 when none is present.
 */
function nextBoundaryIndex(text) {
  // String.prototype.search already returns -1 on no match.
  return text.search(/[.!?,]/);
}
787
/**
 * Trim a sentence and push it onto the queue; whitespace-only sentences are
 * dropped silently.
 */
function emitSentence(queue, sentence) {
  const cleaned = sentence.trim();
  if (!cleaned) return;
  queue.put(cleaned);
}
793
/**
 * Append a text chunk to the rolling buffer and emit every complete sentence
 * (terminated by . ! ? or ,) onto the queue, trimmed, dropping empties.
 * Any trailing incomplete text stays in state.buffer for the next chunk.
 */
function handleChunk(state, chunk, queue) {
  state.buffer += chunk;
  for (;;) {
    const cut = state.buffer.search(/[.!?,]/);
    if (cut < 0) break;
    const sentence = state.buffer.slice(0, cut + 1).trim();
    state.buffer = state.buffer.slice(cut + 1);
    if (sentence) {
      queue.put(sentence);
    }
  }
}
803
/**
 * Return `source` unchanged when it is already async-iterable; otherwise wrap
 * a sync iterable in an async-iterable adapter that yields the same items.
 */
function getAsyncIterator(source) {
  if (source[Symbol.asyncIterator]) {
    return source;
  }
  const adapter = {
    async *[Symbol.asyncIterator]() {
      for (const element of source) {
        yield element;
      }
    }
  };
  return adapter;
}
815
/**
 * Minimal unbounded async queue. put() hands the item straight to the oldest
 * waiting consumer when one exists, otherwise buffers it; get() resolves
 * immediately from the buffer or parks until the next put(). Async iteration
 * over the queue never terminates.
 */
var SimpleQueue = class {
  constructor() {
    this.buffer = [];
    this.resolvers = [];
  }
  // Deliver to a parked get() if any, else buffer for later.
  put(item) {
    const waiter = this.resolvers.shift();
    if (waiter) {
      waiter({ value: item, done: false });
    } else {
      this.buffer.push(item);
    }
  }
  // Count of buffered items (parked consumers are not counted).
  size() {
    return this.buffer.length;
  }
  // Next item: synchronous shift when buffered, otherwise park until put().
  async get() {
    if (this.buffer.length > 0) {
      return this.buffer.shift();
    }
    return new Promise((resolve) => {
      this.resolvers.push((result) => resolve(result.value));
    });
  }
  // Endless async iteration over items as they arrive.
  async *[Symbol.asyncIterator]() {
    for (;;) {
      yield await this.get();
    }
  }
};
846
+
847
// src/tts/use-streaming-tts.ts
// Fallback pipeline stages: an identity "synthesis" (the text itself stands
// in for audio) and a no-op player, used when the caller supplies neither.
var defaultSynth = async (text) => text;
var defaultPlayer = async () => void 0;
850
/**
 * Assemble a streaming text-to-speech pipeline: callers feed text via
 * addChunk(); complete sentences are queued and synthesized/played in order.
 *
 * NOTE(review): both processQueue() and streamTokens(textQueue) are kicked off
 * below and consume the SAME textQueue concurrently, so queued sentences race
 * between two consumers — looks unintentional; confirm which path is meant to
 * own the queue.
 *
 * @param options { voice, synth?, play?, chunkSize?, delayMs?, ort? }
 * @returns pipeline controls (ensureReady/addChunk/finishStreaming/stop/...).
 */
function useStreamingTTS(options) {
  const textQueue = new SimpleQueue();   // sentences awaiting synthesis
  const bufferState = { buffer: "" };    // partial text not yet sentence-complete
  let ready = false;                     // set once ORT + voice are loaded
  let stopped = false;                   // stop() flag checked by streamTokens
  let voice = null;                      // loaded voice record
  const synth = options.synth ?? defaultSynth;
  const play = options.play ?? defaultPlayer;
  const chunkSize = options.chunkSize ?? 48;
  const delayMs = options.delayMs ?? 0;
  // Lazy one-time initialization of the ORT environment and voice.
  async function ensureReady() {
    if (ready) return;
    await ensureOrtReady(options.ort ?? {});
    voice = await ensureVoiceLoaded(options.voice);
    ready = true;
  }
  // Feed incoming text; flushes the remainder early once it exceeds chunkSize
  // even without a sentence boundary.
  async function addChunk(text) {
    handleChunk(bufferState, text, textQueue);
    if (bufferState.buffer.length >= chunkSize) {
      emitSentence(textQueue, bufferState.buffer);
      bufferState.buffer = "";
    }
  }
  // Flush any trailing partial sentence at end of input.
  async function finishStreaming() {
    if (bufferState.buffer) {
      emitSentence(textQueue, bufferState.buffer);
      bufferState.buffer = "";
    }
  }
  // Soft stop: only suppresses further chunk playback in streamTokens;
  // the processQueue pipeline is not interrupted by this flag.
  function stop() {
    stopped = true;
  }
  // Synthesize one chunk of text and play the result to completion.
  async function synthAndPlayChunk(text) {
    await ensureReady();
    const audio = await synth(text, voice);
    await play(audio);
  }
  // Queue consumer: text queue -> synthesizer stage -> player stage.
  async function processQueue() {
    await ensureReady();
    const tokenIterator = getAsyncIterator(textQueue);
    const audioIterator = synthesizerWorker(tokenIterator, options.voice, synth);
    await playerWorker(audioIterator, play);
  }
  // Split plain text into whitespace-separated tokens.
  function createTokenIterable(text) {
    return text.split(/\s+/g).filter(Boolean);
  }
  // Alternate consumer: chunk tokens by size and synth/play each chunk.
  async function streamTokens(tokens) {
    await ensureReady();
    await streamTokensToSpeech(tokens, {
      chunkSize,
      delayMs,
      onChunk: async (chunk) => {
        if (stopped) return;
        await synthAndPlayChunk(chunk);
      }
    });
  }
  // Fire-and-forget both consumers (see NOTE above about the shared queue);
  // errors are intentionally swallowed.
  processQueue().catch(() => void 0);
  streamTokens(textQueue).catch(() => void 0);
  return {
    ensureReady,
    addChunk,
    finishStreaming,
    stop,
    synthAndPlayChunk,
    processQueue,
    createTokenIterable
  };
}
919
+ var DEFAULT_VOICE_ID = "en_US-hfc_female-medium";
920
/**
 * Piper-based text-to-speech synthesizer built on @realtimex/piper-tts-web.
 * Downloads/caches the voice model on initialize(), then synthesizes WAV
 * audio via piperTts.predict, optionally decoding it with Web Audio.
 */
var TTSLogic = class {
  constructor(config = {}) {
    this.ready = false;        // true after a successful initialize()
    this.voiceLoaded = false;  // tracks the model-load step specifically
    // Caller config wins over defaults via spread.
    this.config = {
      voiceId: DEFAULT_VOICE_ID,
      sampleRate: 22050,
      ...config
    };
  }
  /**
   * Initialize the synthesizer by loading the voice model.
   * Downloads the model unless piperTts reports it already stored; safe to
   * call repeatedly (no-op once ready).
   * @throws wrapping any piperTts failure.
   */
  async initialize() {
    if (this.ready) return;
    try {
      const voiceId = this.config.voiceId;
      console.log("\u{1F4CD} Loading Piper voice:", voiceId);
      const storedVoices = await piperTts.stored();
      const alreadyCached = Array.isArray(storedVoices) ? storedVoices.includes(voiceId) : false;
      if (!alreadyCached) {
        console.log("\u2B07\uFE0F Downloading voice model...");
        await piperTts.download(voiceId, (progress) => {
          // Progress callback only logs when a total size is known.
          if (progress?.total) {
            const pct = Math.round(progress.loaded * 100 / progress.total);
            console.log(`\u2B07\uFE0F Downloading: ${pct}%`);
          }
        });
      } else {
        console.log("\u2713 Voice found in cache");
      }
      this.voiceLoaded = true;
      this.ready = true;
      console.log("\u2713 Piper synthesizer initialized");
    } catch (error) {
      throw new Error(`Failed to initialize Piper synthesizer: ${error}`);
    }
  }
  /**
   * Check if the synthesizer is ready
   */
  isReady() {
    return this.ready;
  }
  /**
   * Synthesize speech from text
   * @param text - Text to convert to speech
   * @returns Audio data as WAV Blob and Float32Array
   * @throws when not initialized, when text is empty, or on synthesis failure.
   */
  async synthesize(text) {
    if (!this.ready) {
      throw new Error("Synthesizer not initialized. Call initialize() first.");
    }
    const trimmed = text?.trim();
    if (!trimmed) {
      throw new Error("No text provided for synthesis");
    }
    try {
      const wavBlob = await piperTts.predict({
        text: trimmed,
        voiceId: this.config.voiceId
      });
      // Decode the WAV so callers also get raw PCM samples (channel 0 only).
      const arrayBuffer = await wavBlob.arrayBuffer();
      const audioContext = new (window.AudioContext || window.webkitAudioContext)();
      const decodedBuffer = await audioContext.decodeAudioData(arrayBuffer);
      const audioData = decodedBuffer.getChannelData(0);
      // Throwaway context; close() releases its audio resources (not awaited).
      audioContext.close();
      return {
        audioBlob: wavBlob,
        audio: audioData,
        sampleRate: decodedBuffer.sampleRate,
        duration: decodedBuffer.duration
      };
    } catch (error) {
      throw new Error(`Synthesis failed: ${error}`);
    }
  }
  /**
   * Synthesize and return WAV Blob only (faster, no decoding)
   * @throws when not initialized or when text is empty.
   */
  async synthesizeToBlob(text) {
    if (!this.ready) {
      throw new Error("Synthesizer not initialized. Call initialize() first.");
    }
    const trimmed = text?.trim();
    if (!trimmed) {
      throw new Error("No text provided for synthesis");
    }
    return piperTts.predict({
      text: trimmed,
      voiceId: this.config.voiceId
    });
  }
  /**
   * Stop current synthesis (not directly supported, but we can track state)
   */
  stop() {
    console.log("Stop requested");
  }
  /**
   * Dispose of the synthesizer and free resources
   */
  async dispose() {
    this.ready = false;
    this.voiceLoaded = false;
  }
};
1027
/**
 * Deprecated stub kept for backward compatibility.
 * Always warns and yields an empty phoneme list — use
 * PiperSynthesizer.synthesize(text) instead.
 * @param _text - Ignored.
 * @returns An empty array.
 */
function textToPhonemes(_text) {
  const deprecationNotice = "textToPhonemes is deprecated. Use PiperSynthesizer.synthesize(text) instead.";
  console.warn(deprecationNotice);
  return [];
}
1033
+
1034
+ // src/tts/audio-player.ts
1035
var AudioPlayer = class {
  /**
   * Plays Float32Array PCM through the Web Audio API. Lazily creates a
   * single shared AudioContext and tracks the currently playing source so
   * playback can be stopped.
   */
  constructor(config = {}) {
    // Created lazily on first play via getAudioContext().
    this.audioContext = null;
    // The most recently started buffer source, or null when idle.
    this.currentSource = null;
    this.config = {
      sampleRate: 22050,
      volume: 1,
      ...config
    };
  }
  /**
   * Lazily initialize and return the shared AudioContext.
   */
  getAudioContext() {
    if (!this.audioContext) {
      this.audioContext = new (window.AudioContext || window.webkitAudioContext)({
        sampleRate: this.config.sampleRate
      });
    }
    return this.audioContext;
  }
  /**
   * Play audio data.
   * @param audioData - Float32Array of mono audio samples.
   * @param sampleRate - Sample rate of the audio.
   * @returns Promise that resolves when playback ends (or is stopped).
   */
  async play(audioData, sampleRate) {
    const ctx = this.getAudioContext();
    // Autoplay policies suspend fresh contexts until a user gesture;
    // resume before scheduling playback.
    if (ctx.state === "suspended") {
      await ctx.resume();
    }
    const audioBuffer = ctx.createBuffer(1, audioData.length, sampleRate);
    audioBuffer.getChannelData(0).set(audioData);
    const source = ctx.createBufferSource();
    source.buffer = audioBuffer;
    // Per-play gain node snapshots the current volume; later setVolume()
    // calls affect only subsequent play() calls.
    const gainNode = ctx.createGain();
    gainNode.gain.value = this.config.volume;
    source.connect(gainNode);
    gainNode.connect(ctx.destination);
    this.currentSource = source;
    source.start(0);
    return new Promise((resolve) => {
      source.onended = () => {
        // Fix: only clear the tracked source if it is still *this* one.
        // Previously an overlapping play() would have its reference wiped
        // when the older clip finished, making stop() a no-op for the
        // clip that was actually still sounding.
        if (this.currentSource === source) {
          this.currentSource = null;
        }
        resolve();
      };
    });
  }
  /**
   * Stop current playback. Safe to call when nothing is playing.
   */
  stop() {
    if (this.currentSource) {
      try {
        this.currentSource.stop();
        this.currentSource = null;
      } catch (error) {
        // stop() throws if the source never started / already ended;
        // deliberately ignored — the goal (silence) is already met.
      }
    }
  }
  /**
   * Set volume, clamped to [0.0, 1.0]. Applies to the next play() call.
   */
  setVolume(volume) {
    this.config.volume = Math.max(0, Math.min(1, volume));
  }
  /**
   * Close the audio context and free resources. A later play() will
   * lazily create a fresh context.
   */
  async close() {
    this.stop();
    if (this.audioContext) {
      await this.audioContext.close();
      this.audioContext = null;
    }
  }
};
1112
/**
 * Factory helper: build an AudioPlayer with the given configuration.
 * @param config - Optional AudioPlayer configuration overrides.
 * @returns A freshly constructed AudioPlayer.
 */
function createAudioPlayer(config) {
  const player = new AudioPlayer(config);
  return player;
}
1115
+
1116
// Generated bundle export surface; symbols originate from the src/stt/* and src/tts/* modules.
export { AudioPlayer, ResetSTTLogic, STTLogic, SimpleQueue, TTSLogic, VADController, createAudioPlayer, createOrtEnvironment, emitSentence, ensureOrtReady, ensureVoiceLoaded, getAsyncIterator, getBackendLabel, handleChunk, isCorruptModelError, nextBoundaryIndex, playerWorker, preparePiperVoice, resetVoiceCache, streamTokensToSpeech, synthesizerWorker, textToPhonemes, useStreamingTTS, warmupPiper };
1117
//# sourceMappingURL=index.mjs.map