speech-to-speech 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1163 @@
1
+ 'use strict';
2
+
3
+ var vadWeb = require('@ricky0123/vad-web');
4
+ var piperTts = require('@realtimex/piper-tts-web');
5
+
6
// Rollup-generated CJS interop helper: wraps a CommonJS export object in a
// frozen namespace object so `import * as ns` semantics work. ES modules
// (marked with __esModule) are returned unchanged; otherwise every own key
// except 'default' is re-exposed (preserving live getters where present)
// and the original module becomes `ns.default`.
function _interopNamespace(e) {
  if (e && e.__esModule) return e;
  var n = Object.create(null);
  if (e) {
    Object.keys(e).forEach(function (k) {
      if (k !== 'default') {
        var d = Object.getOwnPropertyDescriptor(e, k);
        Object.defineProperty(n, k, d.get ? d : {
          enumerable: true,
          get: function () { return e[k]; }
        });
      }
    });
  }
  n.default = e;
  return Object.freeze(n);
}

// Namespace view of @realtimex/piper-tts-web used by the TTS code below.
var piperTts__namespace = /*#__PURE__*/_interopNamespace(piperTts);
25
+
26
// src/stt/reset-stt-logic.ts
/**
 * Timing gate for speech-to-text: tracks when the current utterance began and
 * when speech was last heard, and reports when transcript state should be
 * reset — after a silence gap ("silence") or an overlong utterance
 * ("utterance-complete"). The clock is injectable via `options.now` so the
 * logic is deterministic under test.
 */
var ResetSTTLogic = class {
  constructor(options = {}) {
    this.partialTranscript = "";
    this.maxSilenceMs = options.maxSilenceMs ?? 2e3;
    this.maxUtteranceMs = options.maxUtteranceMs ?? 15e3;
    this.onReset = options.onReset;
    // Injectable time source; defaults to wall-clock milliseconds.
    this.now = options.now ?? (() => Date.now());
    const initial = this.now();
    this.utteranceStartedAt = initial;
    this.lastActivityAt = initial;
  }
  // Mark that speech was heard at `timestamp` (defaults to the current time).
  recordSpeechActivity(timestamp) {
    const at = timestamp ?? this.now();
    this.lastActivityAt = at;
    if (!this.utteranceStartedAt) this.utteranceStartedAt = at;
  }
  // Store the newest partial transcript and count it as speech activity.
  updatePartialTranscript(partial, timestamp) {
    this.partialTranscript = partial;
    this.recordSpeechActivity(timestamp);
  }
  // Returns the reset reason that currently applies, or null if none does.
  // Silence is checked before utterance length, so "silence" wins when both hold.
  shouldReset(timestamp) {
    const at = timestamp ?? this.now();
    if (at - this.lastActivityAt >= this.maxSilenceMs) return "silence";
    if (at - this.utteranceStartedAt >= this.maxUtteranceMs) return "utterance-complete";
    return null;
  }
  // Reset only when a reset condition holds; returns the reason used (or null).
  maybeReset(timestamp) {
    const reason = this.shouldReset(timestamp);
    if (reason) this.reset(reason, timestamp);
    return reason;
  }
  // Unconditional reset with an explicit reason (defaults to "manual").
  forceReset(reason = "manual", timestamp) {
    this.reset(reason, timestamp);
  }
  // Clear utterance state, then notify `onReset` with a snapshot of the state
  // from just before the reset.
  reset(reason, timestamp) {
    const at = timestamp ?? this.now();
    const snapshot = {
      utteranceStartedAt: this.utteranceStartedAt,
      lastActivityAt: this.lastActivityAt,
      partialTranscript: this.partialTranscript
    };
    this.utteranceStartedAt = at;
    this.lastActivityAt = at;
    this.partialTranscript = "";
    this.onReset?.(reason, snapshot);
  }
};
86
/**
 * Thin lifecycle wrapper around @ricky0123/vad-web's MicVAD: lazily builds
 * the detector on first start(), fans voice start/stop events out to
 * subscribers, and exposes stop/destroy/isActive controls.
 * Browser-only: uses `navigator.mediaDevices` and `window.ort`.
 */
var VADController = class {
  constructor(options) {
    this.vad = null;
    this.voiceStartListeners = /* @__PURE__ */ new Set();
    this.voiceStopListeners = /* @__PURE__ */ new Set();
    this.running = false;
    this.options = options;
  }
  /**
   * Create (if needed) and start the microphone VAD.
   * Throws when microphone APIs are unavailable or the detector fails to load.
   */
  async start() {
    // Already running: just make sure the underlying VAD is listening.
    if (this.running && this.vad) {
      if (!this.vad.listening) {
        await this.vad.start();
      }
      return;
    }
    const micAvailable = typeof navigator !== "undefined" && !!navigator.mediaDevices && !!navigator.mediaDevices.getUserMedia;
    if (!micAvailable) {
      throw new Error("Microphone access is not available.");
    }
    try {
      // Point onnxruntime-web at locally hosted WASM binaries when present.
      const ortAny = window.ort;
      if (ortAny?.env?.wasm) {
        ortAny.env.wasm.wasmPaths = "/ort/";
      }
      if (!this.vad) {
        const baseOptions = vadWeb.getDefaultRealTimeVADOptions("v5");
        this.vad = await vadWeb.MicVAD.new({
          ...baseOptions,
          startOnLoad: false,
          onSpeechStart: () => this.emitVoiceStart(),
          onSpeechEnd: () => this.emitVoiceStop(),
          onVADMisfire: () => {
          },
          minSpeechMs: this.options?.minSpeechMs || 150,
          positiveSpeechThreshold: 0.5,
          negativeSpeechThreshold: 0.35,
          redemptionMs: this.options?.minSilenceMs || 450,
          preSpeechPadMs: 50,
          processorType: "ScriptProcessor",
          onnxWASMBasePath: "/ort/",
          baseAssetPath: "/vad/",
          workletOptions: {}
        });
      }
      if (!this.vad.listening) {
        await this.vad.start();
      }
      this.running = true;
    } catch (error) {
      this.running = false;
      throw new Error(
        error?.message || "Failed to initialize voice activity detector"
      );
    }
  }
  // Pause detection; the VAD instance is kept so a later start() is cheap.
  stop() {
    if (!this.running || !this.vad) return;
    try {
      this.vad.pause();
      this.running = false;
    } catch (error) {
      // Best-effort: pausing a torn-down VAD must not throw to callers.
    }
  }
  // Stop, release the VAD instance, and drop every listener.
  destroy() {
    this.stop();
    if (this.vad) {
      try {
        this.vad.destroy();
      } catch (error) {
        // Ignore teardown failures.
      }
      this.vad = null;
    }
    this.voiceStartListeners.clear();
    this.voiceStopListeners.clear();
  }
  // True while the underlying VAD exists and is actively listening.
  isActive() {
    return this.running && this.vad !== null && this.vad.listening;
  }
  // Subscribe to voice-start events; returns an unsubscribe function.
  onVoiceStart(listener) {
    this.voiceStartListeners.add(listener);
    return () => this.voiceStartListeners.delete(listener);
  }
  // Subscribe to voice-stop events; returns an unsubscribe function.
  onVoiceStop(listener) {
    this.voiceStopListeners.add(listener);
    return () => this.voiceStopListeners.delete(listener);
  }
  // Invoke every voice-start listener, isolating listener failures.
  emitVoiceStart() {
    for (const listener of this.voiceStartListeners) {
      try {
        listener();
      } catch (error) {
        console.error("Error in voice start listener:", error);
      }
    }
  }
  // Invoke every voice-stop listener, isolating listener failures.
  emitVoiceStop() {
    for (const listener of this.voiceStopListeners) {
      try {
        listener();
      } catch (error) {
        console.error("Error in voice stop listener:", error);
      }
    }
  }
};
194
+
195
// src/stt/stt-logic.ts
/**
 * Continuous speech-to-text engine built on the browser SpeechRecognition API.
 *
 * The Web Speech API degrades on long sessions, so this class proactively
 * restarts recognition after a fixed amount of accumulated mic-on time
 * (`sessionDurationMs`, default 30s), stitching the transcript across
 * restarts (`transcriptBeforeRestart` + current session text). It also
 * periodically promotes stale interim results into the final transcript,
 * de-duplicates repeated phrases the API tends to emit (collapseRepeats),
 * and records per-restart timing metrics.
 *
 * Browser-only: relies on `window`, SpeechRecognition events, and timers.
 * The constructor throws when the SpeechRecognition API is unavailable.
 */
var ResetSTTLogic2 = class {
  /**
   * @param onLog        (message, level) => void sink for diagnostic logs
   * @param onTranscript (text) => void, called whenever the stitched transcript changes
   * @param options      { sessionDurationMs?, interimSaveIntervalMs?, preserveTranscriptOnStart? }
   */
  constructor(onLog, onTranscript, options = {}) {
    // --- listening/session state ---
    this.isListening = false;
    this.fullTranscript = "";
    this.heardWords = [];
    // --- optional callbacks (set via the setters below) ---
    this.onWordsUpdate = null;
    this.onMicTimeUpdate = null;
    this.onRestartMetrics = null;
    // --- mic-time accounting; drives the periodic auto-restart ---
    this.micOnTime = 0;
    this.sessionDuration = 3e4;
    this.lastTickTime = 0;
    this.micTimeInterval = null;
    // --- restart bookkeeping ---
    this.restartCount = 0;
    this.isRestarting = false;
    this.isRecognitionRunning = false;
    // --- interim-result tracking ---
    this.lastInterimTranscript = "";
    this.lastInterimSaveTime = 0;
    // NOTE(review): this 1e3 default is immediately overwritten below by the
    // options default of 5e3 — the field initializer is effectively dead.
    this.interimSaveInterval = 1e3;
    this.lastInterimResultTime = 0;
    this.lastSavedLength = 0;
    // Transcript captured before an auto-restart; getFullTranscript() prepends it.
    this.transcriptBeforeRestart = "";
    this.sessionStartTranscript = "";
    this.sessionId = 0;
    // When non-null: restart id whose first post-restart result is still awaited.
    this.awaitingRestartFirstResultId = null;
    this.lastWasFinal = false;
    this.restartMetrics = {};
    this.isAutoRestarting = false;
    this.onLog = onLog;
    this.onTranscript = onTranscript;
    this.options = {
      sessionDurationMs: options.sessionDurationMs ?? 3e4,
      interimSaveIntervalMs: options.interimSaveIntervalMs ?? 5e3,
      preserveTranscriptOnStart: options.preserveTranscriptOnStart ?? false
    };
    this.sessionDuration = this.options.sessionDurationMs;
    this.interimSaveInterval = this.options.interimSaveIntervalMs;
    const SpeechRecognitionAPI = window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!SpeechRecognitionAPI) {
      this.onLog("Speech Recognition API not supported", "error");
      throw new Error("Speech Recognition API not available");
    }
    this.recognition = new SpeechRecognitionAPI();
    this.setupRecognition();
  }
  // Register a callback fired with the word array whenever it is rebuilt.
  setWordsUpdateCallback(callback) {
    this.onWordsUpdate = callback;
  }
  // Register a callback fired every timer tick with accumulated mic-on ms.
  setMicTimeUpdateCallback(callback) {
    this.onMicTimeUpdate = callback;
  }
  // Register a callback fired after each auto-restart with (count, durationMs).
  setRestartMetricsCallback(callback) {
    this.onRestartMetrics = callback;
  }
  // Wire optional VAD-style hooks; only onUserSpeechStart is invoked here
  // (on the first interim result after a final one).
  setVadCallbacks(onSpeechStart, onSpeechEnd) {
    this.onUserSpeechStart = onSpeechStart || void 0;
    this.onUserSpeechEnd = onSpeechEnd || void 0;
  }
  // Configured auto-restart period in milliseconds.
  getSessionDurationMs() {
    return this.sessionDuration;
  }
  // True while an auto-restart cycle is in flight.
  isInAutoRestart() {
    return this.isAutoRestarting;
  }
  // Stitched transcript: text saved before the last restart plus the text
  // accumulated in the current recognition session.
  getFullTranscript() {
    if (this.transcriptBeforeRestart.length > 0) {
      if (this.fullTranscript.length > 0) {
        return (this.transcriptBeforeRestart + " " + this.fullTranscript).trim();
      }
      return this.transcriptBeforeRestart;
    }
    return this.fullTranscript;
  }
  // Drop all accumulated transcript state.
  clearTranscript() {
    this.fullTranscript = "";
    this.transcriptBeforeRestart = "";
    this.sessionStartTranscript = "";
    this.heardWords = [];
  }
  // Configure the SpeechRecognition instance and attach the four event
  // handlers (result/error/end/start). Handlers are kept on `this` so
  // destroy() can detach them.
  setupRecognition() {
    this.recognition.lang = "en-US";
    this.recognition.interimResults = true;
    this.recognition.continuous = true;
    this.recognition.maxAlternatives = 1;
    this.resultHandler = (event) => {
      const speechEvent = event;
      // Concatenate every result alternative-0 into one running transcript.
      let completeTranscript = "";
      for (let i = 0; i < speechEvent.results.length; i++) {
        completeTranscript += speechEvent.results[i][0].transcript + " ";
      }
      completeTranscript = completeTranscript.trim();
      const isFinal = speechEvent.results[speechEvent.results.length - 1].isFinal;
      completeTranscript = this.collapseRepeats(completeTranscript);
      this.lastInterimTranscript = completeTranscript;
      this.lastInterimResultTime = Date.now();
      // First result (interim or final) after a restart: record latency.
      if (this.awaitingRestartFirstResultId != null) {
        const rid = this.awaitingRestartFirstResultId;
        if (this.restartMetrics[rid] && !this.restartMetrics[rid].firstResultAt) {
          this.restartMetrics[rid].firstResultAt = Date.now();
          const delta = this.restartMetrics[rid].firstResultAt - this.restartMetrics[rid].requestedAt;
          this.onLog(
            `\u{1F514} First result after restart #${rid} in ${delta}ms`,
            "info"
          );
          this.awaitingRestartFirstResultId = null;
        }
      }
      this.onLog(
        `[${isFinal ? "FINAL" : "INTERIM"}] "${completeTranscript}"`,
        isFinal ? "info" : "warning"
      );
      // Interim right after a final ⇒ the user started speaking again.
      if (!isFinal && this.lastWasFinal) {
        try {
          this.onUserSpeechStart?.();
        } catch {
        }
      }
      this.lastWasFinal = isFinal;
      if (isFinal) {
        // Fold the finalized text into this session's transcript and notify.
        this.fullTranscript = (this.sessionStartTranscript + " " + completeTranscript).trim();
        this.fullTranscript = this.collapseRepeats(this.fullTranscript);
        this.heardWords = this.fullTranscript.split(/\s+/).filter((word) => word.length > 0);
        this.onTranscript(this.getFullTranscript());
        this.lastSavedLength = this.fullTranscript.length;
        if (this.onWordsUpdate) this.onWordsUpdate(this.heardWords);
        this.lastInterimTranscript = "";
        if (this.awaitingRestartFirstResultId != null) {
          const rid = this.awaitingRestartFirstResultId;
          if (this.restartMetrics[rid] && !this.restartMetrics[rid].firstResultAt) {
            this.restartMetrics[rid].firstResultAt = Date.now();
            // NOTE(review): the next line computes a value that is never used —
            // looks like a leftover from an earlier metrics calculation.
            this.restartMetrics[rid].startedAt || this.restartMetrics[rid].startAttemptAt || Date.now();
            const firstResultDelta = this.restartMetrics[rid].firstResultAt - this.restartMetrics[rid].requestedAt;
            this.onLog(
              `\u{1F514} First result after restart #${rid} in ${firstResultDelta}ms`,
              "info"
            );
            this.awaitingRestartFirstResultId = null;
          }
        }
      }
    };
    this.recognition.addEventListener("result", this.resultHandler);
    this.errorHandler = (event) => {
      const errorEvent = event;
      // "aborted" is expected while we are deliberately restarting.
      if (errorEvent.error === "aborted" && this.isRestarting) {
        this.onLog("Aborted during restart (ignored)", "info");
        this.isRecognitionRunning = false;
        return;
      }
      this.onLog(`Error: ${errorEvent.error}`, "error");
      // Transient errors: retry after a short delay if still listening.
      if (errorEvent.error === "no-speech" || errorEvent.error === "audio-capture" || errorEvent.error === "network") {
        setTimeout(() => {
          if (this.isListening && !this.isRestarting && !this.isRecognitionRunning) {
            try {
              this.recognition.start();
              this.isRecognitionRunning = true;
              this.sessionId++;
            } catch (e) {
              this.onLog(`Failed restart after error: ${e}`, "error");
            }
          }
        }, 500);
      } else {
        this.onLog(
          `Unhandled SpeechRecognition error: ${errorEvent.error}`,
          "warning"
        );
      }
    };
    this.recognition.addEventListener("error", this.errorHandler);
    this.endHandler = () => {
      this.isRecognitionRunning = false;
      // The API stops itself periodically; auto-resume unless we stopped it.
      if (this.isListening && !this.isRestarting) {
        setTimeout(() => {
          if (this.isListening && !this.isRestarting) {
            try {
              this.recognition.start();
              this.isRecognitionRunning = true;
              this.sessionId++;
              this.onLog(
                `\u{1F501} Auto-resumed recognition after end (session ${this.sessionId})`,
                "info"
              );
            } catch (e) {
              this.onLog(`Failed to auto-start after end: ${e}`, "error");
            }
          }
        }, 100);
      }
    };
    this.recognition.addEventListener("end", this.endHandler);
    this.startHandler = () => {
      this.isRecognitionRunning = true;
      // Record how long the restart took to actually begin recognizing.
      const rid = this.awaitingRestartFirstResultId;
      if (rid != null && this.restartMetrics[rid]) {
        if (!this.restartMetrics[rid].startedAt) {
          this.restartMetrics[rid].startedAt = Date.now();
          this.onLog(
            `\u25B6\uFE0F Restart #${rid} recognition started in ${this.restartMetrics[rid].startedAt - this.restartMetrics[rid].requestedAt}ms`,
            "info"
          );
        }
      }
    };
    this.recognition.addEventListener("start", this.startHandler);
  }
  // Resolve with the next `eventName` event from the recognizer, or with
  // null after `timeoutMs`. The listener is always removed; never rejects.
  waitForEventOnce(eventName, timeoutMs) {
    return new Promise((resolve) => {
      let timer = null;
      const handler = (ev) => {
        if (timer !== null) clearTimeout(timer);
        this.recognition.removeEventListener(eventName, handler);
        resolve(ev);
      };
      this.recognition.addEventListener(eventName, handler);
      timer = window.setTimeout(() => {
        this.recognition.removeEventListener(eventName, handler);
        resolve(null);
      }, timeoutMs);
    });
  }
  // 100ms ticker that accumulates mic-on time, periodically flushes stale
  // interim text, and triggers the auto-restart once sessionDuration elapses.
  startMicTimer() {
    this.lastTickTime = Date.now();
    this.lastInterimSaveTime = Date.now();
    this.micTimeInterval = window.setInterval(() => {
      if (this.isListening) {
        const now = Date.now();
        const elapsed = now - this.lastTickTime;
        this.micOnTime += elapsed;
        this.lastTickTime = now;
        if (now - this.lastInterimSaveTime >= this.interimSaveInterval) {
          this.saveInterimToFinal();
          this.lastInterimSaveTime = now;
        }
        if (this.micOnTime >= this.sessionDuration) {
          if (!this.isRestarting) this.performRestart();
        }
        if (this.onMicTimeUpdate) this.onMicTimeUpdate(this.micOnTime);
      }
    }, 100);
  }
  // Cancel the mic ticker if running.
  stopMicTimer() {
    if (this.micTimeInterval) {
      clearInterval(this.micTimeInterval);
      this.micTimeInterval = null;
    }
  }
  // Promote a stale interim transcript (no new result for a full save
  // interval, and longer than what was last saved) into fullTranscript.
  saveInterimToFinal() {
    if (!this.lastInterimTranscript) return;
    const now = Date.now();
    if (now - this.lastInterimResultTime > this.interimSaveInterval && this.lastInterimTranscript.length > this.lastSavedLength) {
      this.fullTranscript = (this.fullTranscript + " " + this.lastInterimTranscript).trim();
      this.fullTranscript = this.collapseRepeats(this.fullTranscript);
      this.lastSavedLength = this.fullTranscript.length;
      if (this.onWordsUpdate) {
        const words = this.fullTranscript.split(/\s+/).filter((w) => w.length > 0);
        this.onWordsUpdate(words);
      }
      this.onTranscript(this.getFullTranscript());
    }
  }
  // Return the part of `current` not already covered by the tail of `base`
  // (longest suffix/prefix overlap), used to de-duplicate stitched text.
  // NOTE(review): not referenced anywhere in this visible chunk — possibly
  // dead code or used by a caller outside this file; confirm before removal.
  getSuffixToAppend(base, current) {
    if (!base || base.length === 0) return current;
    if (!current || current.length === 0) return "";
    base = base.trim();
    current = current.trim();
    if (current.startsWith(base)) {
      return current.slice(base.length).trim();
    }
    const maxOverlap = Math.min(base.length, current.length);
    for (let overlap = maxOverlap; overlap > 0; overlap--) {
      if (base.endsWith(current.slice(0, overlap))) {
        return current.slice(overlap).trim();
      }
    }
    return current;
  }
  // Remove repetition artifacts the recognizer produces:
  //  1) whole-string periodicity via a KMP failure-function check,
  //  2) adjacent duplicated word blocks (block sizes 20 down to 1),
  //  3) immediately repeated single words.
  // NOTE(review): assumes `text` is a string or falsy-but-trimmable; a null
  // argument would throw on `text.trim()` — callers here always pass strings.
  collapseRepeats(text) {
    if (!text || text.trim().length === 0) return text.trim();
    let normalized = text.replace(/\s+/g, " ").trim();
    const n = normalized.length;
    const lps = new Array(n).fill(0);
    for (let i = 1; i < n; i++) {
      let j = lps[i - 1];
      while (j > 0 && normalized[i] !== normalized[j]) j = lps[j - 1];
      if (normalized[i] === normalized[j]) j++;
      lps[i] = j;
    }
    // Smallest period of the string; if it divides n the text is a pure repeat.
    const period = n - lps[n - 1];
    if (period < n && n % period === 0) {
      return normalized.slice(0, period).trim();
    }
    const words = normalized.split(" ");
    for (let block = Math.min(20, Math.floor(words.length / 2)); block >= 1; block--) {
      let i = 0;
      while (i + 2 * block <= words.length) {
        let blockA = words.slice(i, i + block).join(" ");
        let blockB = words.slice(i + block, i + 2 * block).join(" ");
        if (blockA === blockB) {
          words.splice(i + block, block);
        } else {
          i++;
        }
      }
    }
    const collapsedWords = [];
    for (const w of words) {
      if (collapsedWords.length === 0 || collapsedWords[collapsedWords.length - 1] !== w)
        collapsedWords.push(w);
    }
    return collapsedWords.join(" ").trim();
  }
  // Auto-restart state machine: snapshot the transcript, stop (or abort)
  // recognition, start it again, and time each phase. Runs asynchronously;
  // guarded so only one restart can be in flight.
  performRestart() {
    if (!this.isListening || this.isRestarting) return;
    const restartStartTime = Date.now();
    this.restartCount++;
    this.isRestarting = true;
    this.isAutoRestarting = true;
    const rid = ++this.sessionId;
    this.awaitingRestartFirstResultId = rid;
    this.restartMetrics[rid] = { requestedAt: restartStartTime };
    this.onLog(
      `\u{1F504} [AUTO-RESTART] Session ${rid} - buffering transcript, waiting for silence...`,
      "warning"
    );
    // Flush any pending interim text, then move everything said so far into
    // transcriptBeforeRestart so the new session starts from an empty buffer.
    if (this.lastInterimTranscript.trim().length > 0) {
      this.saveInterimToFinal();
    }
    this.transcriptBeforeRestart = this.getFullTranscript();
    this.fullTranscript = "";
    this.sessionStartTranscript = "";
    this.lastInterimTranscript = "";
    this.heardWords = [];
    this.stopMicTimer();
    const stopTimeout = 600;
    const startTimeout = 1e3;
    const firstResultTimeout = 2e3;
    // Stop recognition gracefully; fall back to abort() if no "end" arrives.
    const stopNow = async () => {
      try {
        if (this.isRecognitionRunning) {
          this.recognition.stop();
        } else {
          this.onLog("Recognition not running at stop attempt", "warning");
        }
      } catch (err) {
        this.onLog(`Stop threw: ${err}`, "warning");
      }
      const endEvent = await this.waitForEventOnce("end", stopTimeout);
      if (!endEvent) {
        try {
          this.recognition.abort();
        } catch (err) {
          this.onLog(`Abort also failed: ${err}`, "error");
        }
        await this.waitForEventOnce("end", 300);
      }
      this.restartMetrics[rid].stopAt = Date.now();
    };
    // Fire-and-forget restart sequence; errors are logged, never thrown.
    (async () => {
      await stopNow();
      this.restartMetrics[rid].startAttemptAt = Date.now();
      try {
        if (!this.isRecognitionRunning) {
          this.sessionId = rid;
          this.recognition.start();
        } else {
          this.onLog(
            "Recognition already running at restart time; skipping start.",
            "warning"
          );
        }
      } catch (e) {
        this.onLog(`Failed to start recognition after restart: ${e}`, "error");
      }
      const startEv = await this.waitForEventOnce("start", startTimeout);
      if (startEv) {
        this.restartMetrics[rid].startedAt = Date.now();
      } else {
        this.onLog(
          `Restart #${rid} did not produce start event within ${startTimeout}ms`,
          "warning"
        );
      }
      const resEv = await this.waitForEventOnce("result", firstResultTimeout);
      if (resEv) {
        if (this.restartMetrics[rid])
          this.restartMetrics[rid].firstResultAt = Date.now();
        const firstResultDelta = (this.restartMetrics[rid].firstResultAt || Date.now()) - (this.restartMetrics[rid].requestedAt || Date.now());
        this.onLog(
          `\u{1F514} First result after restart #${rid} in ${firstResultDelta}ms`,
          "info"
        );
      } else {
        this.onLog(
          `Restart #${rid} produced no result within ${firstResultTimeout}ms`,
          "warning"
        );
      }
      const startedAt = this.restartMetrics[rid].startedAt || this.restartMetrics[rid].startAttemptAt || Date.now();
      const restartDuration = startedAt - this.restartMetrics[rid].requestedAt;
      if (this.onRestartMetrics)
        this.onRestartMetrics(this.restartCount, restartDuration);
      this.onLog(
        `\u2705 Session ${rid} restarted in ${restartDuration}ms - resuming from silence gate`,
        "info"
      );
      this.startMicTimer();
      this.isRestarting = false;
      this.isAutoRestarting = false;
    })();
  }
  // Begin listening: optionally clear prior transcript, reset counters,
  // start recognition (if not already running) and the mic ticker.
  start() {
    if (this.isListening) return;
    try {
      this.isListening = true;
      if (!this.options.preserveTranscriptOnStart) {
        this.fullTranscript = "";
        this.heardWords = [];
        this.transcriptBeforeRestart = "";
        this.sessionStartTranscript = "";
      } else {
        // Keep prior text; new finals are appended after it.
        this.sessionStartTranscript = this.fullTranscript;
      }
      this.micOnTime = 0;
      this.restartCount = 0;
      this.lastSavedLength = 0;
      this.lastInterimTranscript = "";
      this.lastWasFinal = false;
      if (!this.isRecognitionRunning) {
        this.sessionId++;
        this.recognition.start();
        this.isRecognitionRunning = true;
      }
      this.startMicTimer();
      this.onLog(
        "Listening started (auto-restart every 30s of mic time)",
        "info"
      );
    } catch (error) {
      this.isListening = false;
      this.onLog(`Failed to start: ${error}`, "error");
    }
  }
  // Stop listening and the mic ticker; logs total mic time and restart count.
  stop() {
    if (!this.isListening) return;
    try {
      this.isListening = false;
      this.isAutoRestarting = false;
      this.stopMicTimer();
      this.recognition.stop();
      this.isRecognitionRunning = false;
      this.onLog(
        `Stopped listening (total mic time: ${(this.micOnTime / 1e3).toFixed(
          1
        )}s, restarts: ${this.restartCount})`,
        "info"
      );
    } catch (error) {
      this.onLog(`Failed to stop: ${error}`, "error");
    }
  }
  // Tear down: abort recognition (best-effort) and detach all handlers.
  destroy() {
    this.isListening = false;
    this.stopMicTimer();
    try {
      this.recognition.abort?.();
    } catch (e) {
      // Best-effort abort; recognizer may already be gone.
    }
    try {
      if (this.resultHandler)
        this.recognition.removeEventListener("result", this.resultHandler);
      if (this.errorHandler)
        this.recognition.removeEventListener("error", this.errorHandler);
      if (this.endHandler)
        this.recognition.removeEventListener(
          "end",
          this.endHandler
        );
      if (this.startHandler)
        this.recognition.removeEventListener(
          "start",
          this.startHandler
        );
    } catch (e) {
      // Listener removal is best-effort during teardown.
    }
  }
};
682
// Public STT class; currently identical to ResetSTTLogic2 — presumably kept
// as a separate exported name so the base implementation can change without
// affecting consumers (verify against the package's public API).
var STTLogic = class extends ResetSTTLogic2 {
};
684
+
685
// src/tts/prepare-piper-voice.ts
/**
 * Normalize a partial Piper voice config into the full descriptor used by
 * the synthesizer.
 *
 * Defaults: modelPath `voices/<voiceId>.onnx`, sampleRate 22050 Hz,
 * lengthScale 1, noiseScale 0.667, speaker "default".
 */
function preparePiperVoice(config) {
  const resolvedModelPath = config.modelPath ?? `voices/${config.voiceId}.onnx`;
  const inference = {
    lengthScale: config.lengthScale ?? 1,
    noiseScale: config.noiseScale ?? 0.667
  };
  const metadata = {
    speaker: config.speaker ?? "default"
  };
  return {
    voiceId: config.voiceId,
    modelPath: resolvedModelPath,
    sampleRate: config.sampleRate ?? 22050,
    inference,
    metadata
  };
}
701
+
702
// src/tts/stream-tokens-to-speech.ts
// True when `value` implements the async-iteration protocol.
function isAsyncIterable(value) {
  return typeof value[Symbol.asyncIterator] === "function";
}
// Promise-based delay helper.
var sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
/**
 * Accumulate streamed tokens (sync or async iterable) into chunks of at
 * least `chunkSize` characters and hand each chunk to `options.onChunk`,
 * optionally pausing `delayMs` between chunks. A trailing partial chunk is
 * always flushed at the end.
 *
 * @returns {{ chunksEmitted: number, characters: number }} emission stats
 */
async function streamTokensToSpeech(tokens, options = {}) {
  const chunkSize = options.chunkSize ?? 40;
  const delayMs = options.delayMs ?? 0;
  let pending = "";
  let chunksEmitted = 0;
  let characters = 0;
  // Deliver the buffered text (if any), then clear it and apply the delay.
  const flush = async () => {
    if (!pending) return;
    characters += pending.length;
    chunksEmitted += 1;
    if (options.onChunk) {
      await options.onChunk(pending);
    }
    pending = "";
    if (delayMs > 0) {
      await sleep(delayMs);
    }
  };
  const feed = async (token) => {
    pending += token;
    if (pending.length >= chunkSize) {
      await flush();
    }
  };
  if (isAsyncIterable(tokens)) {
    for await (const token of tokens) {
      await feed(token);
    }
  } else {
    for (const token of tokens) {
      await feed(token);
    }
  }
  await flush();
  return { chunksEmitted, characters };
}
745
+
746
// src/tts/ort-setup.ts
/**
 * Build a lightweight descriptor of the ONNX Runtime execution environment.
 * Provider order defaults to ["webgpu", "wasm"] when the device is "webgpu",
 * otherwise ["wasm"]; an explicit `config.providers` overrides both.
 * The returned object is initialized before being returned.
 */
async function createOrtEnvironment(config = {}) {
  const defaultProviders = config.device === "webgpu" ? ["webgpu", "wasm"] : ["wasm"];
  const environment = {
    device: config.device ?? "cpu",
    logLevel: config.logLevel ?? "warning",
    providers: config.providers ?? defaultProviders,
    initialized: false,
    // Marks the environment ready; kept async to match future real setup work.
    async init() {
      this.initialized = true;
    }
  };
  await environment.init();
  return environment;
}
761
+
762
// src/tts/piper.ts
// Prepared voices keyed by voiceId, plus the lazily created ORT environment.
var voiceCache = /* @__PURE__ */ new Map();
var ortEnv = null;
// Create the ORT environment once and reuse it on subsequent calls.
async function ensureOrtReady(config = {}) {
  if (!ortEnv) {
    ortEnv = await createOrtEnvironment(config);
  }
  return ortEnv;
}
// Return the cached voice descriptor, preparing and caching it on first use.
async function ensureVoiceLoaded(config) {
  const existing = voiceCache.get(config.voiceId);
  if (existing) return existing;
  const prepared = preparePiperVoice(config);
  voiceCache.set(config.voiceId, prepared);
  return prepared;
}
// Run one synthesis pass so model/voice setup cost is paid up front.
async function warmupPiper(voiceConfig, synth, text = "warmup") {
  const voice = await ensureVoiceLoaded(voiceConfig);
  await synth(text, voice);
}
// Drop every cached voice descriptor.
function resetVoiceCache() {
  voiceCache.clear();
}
// Human-readable label for the configured execution device.
function getBackendLabel(device) {
  if (!device) return "auto";
  if (device === "webgpu") return "WebGPU";
  return "CPU";
}
// Heuristic: does this error (string or Error) indicate a corrupt model file?
function isCorruptModelError(error) {
  if (!error) return false;
  const message = typeof error === "string" ? error : error.message;
  if (!message) return false;
  return /corrupt|checksum|integrity/i.test(message);
}
// Consume queued text and yield the synthesis result for each item in order.
async function* synthesizerWorker(textQueue, voiceConfig, synth) {
  const voice = await ensureVoiceLoaded(voiceConfig);
  for await (const text of textQueue) {
    yield synth(text, voice);
  }
}
// Play each audio item from the queue, strictly one at a time.
async function playerWorker(audioQueue, play) {
  for await (const audio of audioQueue) {
    await play(audio);
  }
}
// Index of the first sentence/phrase boundary character, or -1 when absent.
function nextBoundaryIndex(text) {
  return text.search(/[.!?,]/);
}
// Queue a sentence for synthesis unless it is only whitespace.
function emitSentence(queue, sentence) {
  const clean = sentence.trim();
  if (clean) {
    queue.put(clean);
  }
}
// Append a chunk to the rolling buffer and queue every complete sentence.
function handleChunk(state, chunk, queue) {
  state.buffer += chunk;
  for (let cut = nextBoundaryIndex(state.buffer); cut >= 0; cut = nextBoundaryIndex(state.buffer)) {
    const sentence = state.buffer.slice(0, cut + 1);
    state.buffer = state.buffer.slice(cut + 1);
    emitSentence(queue, sentence);
  }
}
// Wrap a sync iterable for for-await consumption; async iterables pass through.
function getAsyncIterator(source) {
  if (source[Symbol.asyncIterator]) {
    return source;
  }
  return {
    async *[Symbol.asyncIterator]() {
      yield* source;
    }
  };
}
/**
 * Minimal unbounded async queue: producers call put(), consumers await get()
 * or iterate with for-await. Iteration never terminates (no "done" signal).
 */
var SimpleQueue = class {
  constructor() {
    this.buffer = [];
    this.resolvers = [];
  }
  // Hand the item straight to a waiting consumer, or buffer it.
  put(item) {
    const waiter = this.resolvers.shift();
    if (waiter) {
      waiter({ value: item, done: false });
    } else {
      this.buffer.push(item);
    }
  }
  // Number of items currently buffered (waiting consumers are not counted).
  size() {
    return this.buffer.length;
  }
  // Resolve immediately from the buffer, or park until the next put().
  async get() {
    if (this.buffer.length > 0) {
      return this.buffer.shift();
    }
    return new Promise((resolve) => {
      this.resolvers.push(({ value }) => resolve(value));
    });
  }
  async *[Symbol.asyncIterator]() {
    for (;;) {
      yield await this.get();
    }
  }
};
868
+
869
// src/tts/use-streaming-tts.ts
// Identity "synthesizer" and no-op player used when the caller supplies none.
var defaultSynth = async (text) => text;
var defaultPlayer = async () => void 0;
/**
 * Streaming TTS pipeline: callers feed text via addChunk(); complete
 * sentences are queued and synthesized/played in order. Returns the control
 * surface (addChunk/finishStreaming/stop/...).
 *
 * NOTE(review): two consumers of `textQueue` are started at creation time —
 * `processQueue()` (synthesizerWorker + playerWorker) and
 * `streamTokens(textQueue)` (streamTokensToSpeech + synthAndPlayChunk).
 * Both pull from the same queue, so queued sentences will be split between
 * the two pipelines; confirm this dual consumption is intended.
 * NOTE(review): `stopped` is only consulted on the streamTokens path;
 * processQueue keeps playing after stop() is called.
 */
function useStreamingTTS(options) {
  const textQueue = new SimpleQueue();
  // Rolling buffer of text not yet terminated by a sentence boundary.
  const bufferState = { buffer: "" };
  let ready = false;
  let stopped = false;
  let voice = null;
  const synth = options.synth ?? defaultSynth;
  const play = options.play ?? defaultPlayer;
  const chunkSize = options.chunkSize ?? 48;
  const delayMs = options.delayMs ?? 0;
  // Lazily initialize ORT and the voice; safe to call repeatedly.
  async function ensureReady() {
    if (ready) return;
    await ensureOrtReady(options.ort ?? {});
    voice = await ensureVoiceLoaded(options.voice);
    ready = true;
  }
  // Feed incoming text; complete sentences are queued immediately, and an
  // oversized sentence-less remainder is force-flushed at chunkSize.
  async function addChunk(text) {
    handleChunk(bufferState, text, textQueue);
    if (bufferState.buffer.length >= chunkSize) {
      emitSentence(textQueue, bufferState.buffer);
      bufferState.buffer = "";
    }
  }
  // Flush whatever partial sentence remains in the buffer.
  async function finishStreaming() {
    if (bufferState.buffer) {
      emitSentence(textQueue, bufferState.buffer);
      bufferState.buffer = "";
    }
  }
  // Request that the streamTokens path stop playing further chunks.
  function stop() {
    stopped = true;
  }
  // Synthesize one chunk of text and play it end-to-end.
  async function synthAndPlayChunk(text) {
    await ensureReady();
    const audio = await synth(text, voice);
    await play(audio);
  }
  // Queue-driven pipeline: synthesize queued sentences, play results in order.
  async function processQueue() {
    await ensureReady();
    const tokenIterator = getAsyncIterator(textQueue);
    const audioIterator = synthesizerWorker(tokenIterator, options.voice, synth);
    await playerWorker(audioIterator, play);
  }
  // Split plain text into whitespace-delimited tokens.
  function createTokenIterable(text) {
    return text.split(/\s+/g).filter(Boolean);
  }
  // Token-driven pipeline: re-chunk tokens by size and synth/play each chunk.
  async function streamTokens(tokens) {
    await ensureReady();
    await streamTokensToSpeech(tokens, {
      chunkSize,
      delayMs,
      onChunk: async (chunk) => {
        if (stopped) return;
        await synthAndPlayChunk(chunk);
      }
    });
  }
  // Both pipelines are started immediately; failures are deliberately
  // swallowed (fire-and-forget). See the dual-consumption note above.
  processQueue().catch(() => void 0);
  streamTokens(textQueue).catch(() => void 0);
  return {
    ensureReady,
    addChunk,
    finishStreaming,
    stop,
    synthAndPlayChunk,
    processQueue,
    createTokenIterable
  };
}
941
// Default Piper voice model id (US English "hfc_female", medium quality).
var DEFAULT_VOICE_ID = "en_US-hfc_female-medium";
942
+ var TTSLogic = class {
943
+ constructor(config = {}) {
944
+ this.ready = false;
945
+ this.voiceLoaded = false;
946
+ this.config = {
947
+ voiceId: DEFAULT_VOICE_ID,
948
+ sampleRate: 22050,
949
+ ...config
950
+ };
951
+ }
952
+ /**
953
+ * Initialize the synthesizer by loading the voice model
954
+ */
955
+ async initialize() {
956
+ if (this.ready) return;
957
+ try {
958
+ const voiceId = this.config.voiceId;
959
+ console.log("\u{1F4CD} Loading Piper voice:", voiceId);
960
+ const storedVoices = await piperTts__namespace.stored();
961
+ const alreadyCached = Array.isArray(storedVoices) ? storedVoices.includes(voiceId) : false;
962
+ if (!alreadyCached) {
963
+ console.log("\u2B07\uFE0F Downloading voice model...");
964
+ await piperTts__namespace.download(voiceId, (progress) => {
965
+ if (progress?.total) {
966
+ const pct = Math.round(progress.loaded * 100 / progress.total);
967
+ console.log(`\u2B07\uFE0F Downloading: ${pct}%`);
968
+ }
969
+ });
970
+ } else {
971
+ console.log("\u2713 Voice found in cache");
972
+ }
973
+ this.voiceLoaded = true;
974
+ this.ready = true;
975
+ console.log("\u2713 Piper synthesizer initialized");
976
+ } catch (error) {
977
+ throw new Error(`Failed to initialize Piper synthesizer: ${error}`);
978
+ }
979
+ }
980
+ /**
981
+ * Check if the synthesizer is ready
982
+ */
983
+ isReady() {
984
+ return this.ready;
985
+ }
986
+ /**
987
+ * Synthesize speech from text
988
+ * @param text - Text to convert to speech
989
+ * @returns Audio data as WAV Blob and Float32Array
990
+ */
991
+ async synthesize(text) {
992
+ if (!this.ready) {
993
+ throw new Error("Synthesizer not initialized. Call initialize() first.");
994
+ }
995
+ const trimmed = text?.trim();
996
+ if (!trimmed) {
997
+ throw new Error("No text provided for synthesis");
998
+ }
999
+ try {
1000
+ const wavBlob = await piperTts__namespace.predict({
1001
+ text: trimmed,
1002
+ voiceId: this.config.voiceId
1003
+ });
1004
+ const arrayBuffer = await wavBlob.arrayBuffer();
1005
+ const audioContext = new (window.AudioContext || window.webkitAudioContext)();
1006
+ const decodedBuffer = await audioContext.decodeAudioData(arrayBuffer);
1007
+ const audioData = decodedBuffer.getChannelData(0);
1008
+ audioContext.close();
1009
+ return {
1010
+ audioBlob: wavBlob,
1011
+ audio: audioData,
1012
+ sampleRate: decodedBuffer.sampleRate,
1013
+ duration: decodedBuffer.duration
1014
+ };
1015
+ } catch (error) {
1016
+ throw new Error(`Synthesis failed: ${error}`);
1017
+ }
1018
+ }
1019
+ /**
1020
+ * Synthesize and return WAV Blob only (faster, no decoding)
1021
+ */
1022
+ async synthesizeToBlob(text) {
1023
+ if (!this.ready) {
1024
+ throw new Error("Synthesizer not initialized. Call initialize() first.");
1025
+ }
1026
+ const trimmed = text?.trim();
1027
+ if (!trimmed) {
1028
+ throw new Error("No text provided for synthesis");
1029
+ }
1030
+ return piperTts__namespace.predict({
1031
+ text: trimmed,
1032
+ voiceId: this.config.voiceId
1033
+ });
1034
+ }
1035
+ /**
1036
+ * Stop current synthesis (not directly supported, but we can track state)
1037
+ */
1038
+ stop() {
1039
+ console.log("Stop requested");
1040
+ }
1041
+ /**
1042
+ * Dispose of the synthesizer and free resources
1043
+ */
1044
+ async dispose() {
1045
+ this.ready = false;
1046
+ this.voiceLoaded = false;
1047
+ }
1048
+ };
1049
/**
 * @deprecated Legacy shim retained for API compatibility only.
 * Logs a deprecation warning and yields no phonemes.
 * @param {string} _text - Ignored.
 * @returns {Array} Always an empty array.
 */
function textToPhonemes(_text) {
  const deprecationNotice = "textToPhonemes is deprecated. Use PiperSynthesizer.synthesize(text) instead.";
  console.warn(deprecationNotice);
  return [];
}
1055
+
1056
+ // src/tts/audio-player.ts
1057
/**
 * Minimal Web Audio playback helper for raw Float32Array samples.
 * Creates its AudioContext lazily and tracks at most one active source.
 */
var AudioPlayer = class {
  /**
   * @param {{sampleRate?: number, volume?: number}} [config]
   *   Merged over defaults (sampleRate: 22050, volume: 1).
   */
  constructor(config = {}) {
    this.audioContext = null;
    this.currentSource = null;
    this.config = {
      sampleRate: 22050,
      volume: 1,
      ...config
    };
  }
  /**
   * Lazily create (and thereafter reuse) the underlying AudioContext.
   * @returns {AudioContext}
   */
  getAudioContext() {
    if (this.audioContext === null) {
      const ContextCtor = window.AudioContext || window.webkitAudioContext;
      this.audioContext = new ContextCtor({ sampleRate: this.config.sampleRate });
    }
    return this.audioContext;
  }
  /**
   * Play a buffer of samples through a gain node at the configured volume.
   * Resolves once playback ends (or is stopped).
   * @param {Float32Array} audioData - Mono audio samples.
   * @param {number} sampleRate - Sample rate of the audio.
   */
  async play(audioData, sampleRate) {
    const context = this.getAudioContext();
    if (context.state === "suspended") {
      await context.resume();
    }
    const buffer = context.createBuffer(1, audioData.length, sampleRate);
    buffer.getChannelData(0).set(audioData);
    const source = context.createBufferSource();
    source.buffer = buffer;
    const gain = context.createGain();
    gain.gain.value = this.config.volume;
    source.connect(gain);
    gain.connect(context.destination);
    this.currentSource = source;
    source.start(0);
    await new Promise((resolve) => {
      source.onended = () => {
        this.currentSource = null;
        resolve();
      };
    });
  }
  /**
   * Stop the active source, if any. Errors from stopping an already
   * finished source are deliberately ignored (best-effort).
   */
  stop() {
    const active = this.currentSource;
    if (!active) {
      return;
    }
    try {
      active.stop();
      this.currentSource = null;
    } catch (error) {
      // ignore: source may already have ended
    }
  }
  /**
   * Set volume for subsequent playback, clamped into [0, 1].
   * @param {number} volume
   */
  setVolume(volume) {
    const clamped = Math.min(1, Math.max(0, volume));
    this.config.volume = clamped;
  }
  /**
   * Stop playback and release the AudioContext.
   */
  async close() {
    this.stop();
    if (this.audioContext !== null) {
      await this.audioContext.close();
      this.audioContext = null;
    }
  }
};
1134
/**
 * Factory helper equivalent to `new AudioPlayer(config)`.
 * @param {{sampleRate?: number, volume?: number}} [config]
 * @returns {AudioPlayer}
 */
function createAudioPlayer(config) {
  const player = new AudioPlayer(config);
  return player;
}
1137
+
1138
+ // Public CommonJS API: attach every top-level class/function defined
+ // earlier in this bundle to `exports` for require() consumers.
+ exports.AudioPlayer = AudioPlayer;
1139
+ exports.ResetSTTLogic = ResetSTTLogic;
1140
+ exports.STTLogic = STTLogic;
1141
+ exports.SimpleQueue = SimpleQueue;
1142
+ exports.TTSLogic = TTSLogic;
1143
+ exports.VADController = VADController;
1144
+ exports.createAudioPlayer = createAudioPlayer;
1145
+ exports.createOrtEnvironment = createOrtEnvironment;
1146
+ exports.emitSentence = emitSentence;
1147
+ exports.ensureOrtReady = ensureOrtReady;
1148
+ exports.ensureVoiceLoaded = ensureVoiceLoaded;
1149
+ exports.getAsyncIterator = getAsyncIterator;
1150
+ exports.getBackendLabel = getBackendLabel;
1151
+ exports.handleChunk = handleChunk;
1152
+ exports.isCorruptModelError = isCorruptModelError;
1153
+ exports.nextBoundaryIndex = nextBoundaryIndex;
1154
+ exports.playerWorker = playerWorker;
1155
+ exports.preparePiperVoice = preparePiperVoice;
1156
+ exports.resetVoiceCache = resetVoiceCache;
1157
+ exports.streamTokensToSpeech = streamTokensToSpeech;
1158
+ exports.synthesizerWorker = synthesizerWorker;
1159
+ exports.textToPhonemes = textToPhonemes;
1160
+ exports.useStreamingTTS = useStreamingTTS;
1161
+ exports.warmupPiper = warmupPiper;
1162
+ //# sourceMappingURL=index.cjs.map
1163