@amaster.ai/tts-client 1.1.9 → 1.1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -20,184 +20,1013 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
20
20
  // src/index.ts
21
21
  var index_exports = {};
22
22
  __export(index_exports, {
23
- createTTSClient: () => tts_client_default
23
+ createTTSClient: () => createTTSClient,
24
+ createTTSSpeakController: () => createTTSSpeakController,
25
+ preprocessTTSContent: () => preprocessTTSContent,
26
+ splitTextIntoFragments: () => splitTextIntoFragments
24
27
  });
25
28
  module.exports = __toCommonJS(index_exports);
26
29
 
27
30
  // src/tts-client.ts
28
31
  var TTS_PATH = "/api/proxy/builtin/platform/qwen-tts/api-ws/v1/realtime";
29
- function createTTSClient(config) {
32
+ var TTS_MAX_FRAGMENT_LENGTH = 2e3;
33
/**
 * Reports whether a global `window` binding exists in the current runtime.
 *
 * @returns {boolean} `true` when `window` is defined, `false` otherwise.
 */
function isBrowserEnvironment() {
  const hasWindow = typeof window !== "undefined";
  return hasWindow;
}
36
/**
 * Computes a weighted length for `text`: every code point counts as 1,
 * except characters in the CJK ranges matched below, which count as 2.
 *
 * @param {string} text - Text to measure.
 * @returns {number} The weighted length.
 */
function getWeightedTextLength(text) {
  const wideCharPattern = /[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]/;
  return Array.from(text).reduce(
    (total, codePoint) => total + (wideCharPattern.test(codePoint) ? 2 : 1),
    0
  );
}
43
/**
 * Cuts `segment` into consecutive pieces whose weighted length (as measured
 * by `getWeightedTextLength`) does not exceed `maxLength`. A single character
 * heavier than `maxLength` still forms its own piece.
 *
 * @param {string} segment - Text to cut.
 * @param {number} maxLength - Maximum weighted length per piece.
 * @returns {string[]} The pieces, in order; empty when `segment` is empty.
 */
function splitOversizedSegment(segment, maxLength) {
  const pieces = [];
  let buffer = "";
  let bufferWeight = 0;
  for (const glyph of segment) {
    const glyphWeight = getWeightedTextLength(glyph);
    const overflows = buffer !== "" && bufferWeight + glyphWeight > maxLength;
    if (overflows) {
      // Close the running piece before starting a new one with this glyph.
      pieces.push(buffer);
      buffer = "";
      bufferWeight = 0;
    }
    buffer += glyph;
    bufferWeight += glyphWeight;
  }
  if (buffer !== "") {
    pieces.push(buffer);
  }
  return pieces;
}
63
/**
 * Splits `text` into trimmed fragments whose weighted length stays within
 * `maxLength`, preferring to break after line breaks, punctuation or
 * whitespace. Parts that are still too long are cut by
 * `splitOversizedSegment`.
 *
 * Fix over the previous version: text accumulated before an oversized part is
 * now flushed into the result instead of being discarded, so no input text is
 * lost and fragment order follows the input.
 *
 * @param {string} text - Text to split.
 * @param {number} [maxLength=TTS_MAX_FRAGMENT_LENGTH] - Maximum weighted length per fragment.
 * @returns {string[]} Non-empty, trimmed fragments in input order.
 */
function splitTextIntoFragments(text, maxLength = TTS_MAX_FRAGMENT_LENGTH) {
  const fragments = [];
  // Lazily match up to (and including) each run of line breaks, or the end of input.
  const segments = text.match(/.+?(?:\r?\n+|$)/gs) ?? [];
  // Zero-width split points after sentence punctuation, clause punctuation, or whitespace.
  const softBreakPattern = /(?<=[。!?;.!?;])|(?<=[,、,::])|(?<=\s)/;
  let current = "";

  const pushFragment = (fragment) => {
    const trimmed = fragment.trim();
    if (trimmed) {
      fragments.push(trimmed);
    }
  };

  const appendPart = (part) => {
    if (!part.trim()) {
      return;
    }
    if (getWeightedTextLength(part) > maxLength) {
      // Flush what was accumulated so far BEFORE emitting the oversized
      // pieces; otherwise that text would be dropped.
      pushFragment(current);
      current = "";
      for (const fragment of splitOversizedSegment(part, maxLength)) {
        pushFragment(fragment);
      }
      return;
    }
    const next = current ? `${current}${part}` : part;
    if (getWeightedTextLength(next) <= maxLength) {
      current = next;
      return;
    }
    pushFragment(current);
    current = part;
  };

  for (const segment of segments) {
    const parts = segment.split(softBreakPattern).filter((part) => part.trim());
    for (const part of parts) {
      appendPart(part);
    }
  }
  pushFragment(current);
  return fragments;
}
105
/**
 * Normalizes whitespace: CRLF becomes LF, runs of three or more newlines
 * collapse to two, and runs of two or more spaces/tabs collapse to one space.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeWhitespace(text) {
  const unixNewlines = text.replace(/\r\n/g, "\n");
  const cappedBlankLines = unixNewlines.replace(/\n{3,}/g, "\n\n");
  return cappedBlankLines.replace(/[ \t]{2,}/g, " ");
}
108
/**
 * Flattens Markdown table rows (lines that start and end with `|`) into
 * their cell texts joined by a full-width comma.
 *
 * Fix over the previous version: delimiter rows such as `|---|:---:|` carry
 * no content, so they are replaced with an empty line instead of leaking
 * runs of dashes into the text.
 *
 * @param {string} text - Text that may contain Markdown tables.
 * @returns {string} Text with table rows flattened.
 */
function stripMarkdownTables(text) {
  // A delimiter cell is only hyphens, optionally wrapped in alignment colons.
  const delimiterCell = /^:?-+:?$/;
  return text.replace(/^\|(.+)\|$/gm, (_, row) => {
    const cells = row.split("|").map((cell) => cell.trim()).filter(Boolean);
    const isDelimiterRow = cells.length > 0 && cells.every((cell) => delimiterCell.test(cell));
    if (isDelimiterRow) {
      return "";
    }
    return cells.join("\uFF0C");
  });
}
114
/**
 * Removes common Markdown syntax from `text`, keeping the readable content.
 * The rules run in order: fenced code blocks, inline code, images, links,
 * headings, blockquotes, bullet markers, numbered-list markers, and finally
 * any leftover emphasis/heading/quote characters.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with Markdown markers stripped.
 */
function stripMarkdown(text) {
  const rewriteRules = [
    [/```[\s\S]*?```/g, " "],
    [/`([^`]+)`/g, "$1"],
    [/!\[([^\]]*)\]\([^)]+\)/g, "$1"],
    [/\[([^\]]+)\]\(([^)]+)\)/g, "$1"],
    [/^#{1,6}\s+/gm, ""],
    [/^\s*>\s?/gm, ""],
    [/^\s*[-*+]\s+/gm, ""],
    [/^\s*\d+\.\s+/gm, ""],
    [/[*_~#>]+/g, ""]
  ];
  let result = text;
  for (const [pattern, replacement] of rewriteRules) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
117
/**
 * Replaces every `http://` or `https://` URL (up to the next whitespace)
 * with a single space.
 *
 * @param {string} text - Text that may contain URLs.
 * @returns {string} Text with URLs blanked out.
 */
function stripUrls(text) {
  const urlPattern = /https?:\/\/\S+/gi;
  return text.replace(urlPattern, " ");
}
120
/**
 * Replaces each code point in U+1F000–U+1FAFF or U+2600–U+27BF with a space.
 *
 * @param {string} text - Text that may contain emoji or symbols.
 * @returns {string} Text with those code points blanked out.
 */
function stripEmojiAndSymbols(text) {
  const pictographPattern = /[\u{1F000}-\u{1FAFF}\u{2600}-\u{27BF}]/gu;
  return text.replace(pictographPattern, " ");
}
123
/**
 * Prepares raw text for speech synthesis. Runs, in order: whitespace
 * normalization, table flattening, Markdown stripping, URL removal and
 * emoji/symbol removal; then turns remaining `|` into full-width commas,
 * removes trailing blanks before newlines, collapses newline and space runs,
 * and trims the result.
 *
 * @param {string} text - Raw input text.
 * @returns {string} Cleaned text; may be empty.
 */
function preprocessTTSContent(text) {
  const pipeline = [
    normalizeWhitespace,
    stripMarkdownTables,
    stripMarkdown,
    stripUrls,
    stripEmojiAndSymbols
  ];
  let cleaned = text;
  for (const step of pipeline) {
    cleaned = step(cleaned);
  }
  cleaned = cleaned.replace(/[|]/g, "\uFF0C");
  cleaned = cleaned.replace(/[ \t]+\n/g, "\n");
  cleaned = cleaned.replace(/\n+/g, "\n");
  cleaned = cleaned.replace(/[ ]{2,}/g, " ");
  return cleaned.trim();
}
131
/**
 * Decodes a base64 string into its raw bytes using the global `atob`.
 *
 * @param {string} chunk - Base64-encoded data.
 * @returns {Uint8Array} The decoded bytes.
 */
function decodeBase64Chunk(chunk) {
  const decoded = atob(chunk);
  // Each character of the atob output holds one byte value.
  return Uint8Array.from({ length: decoded.length }, (_, position) => decoded.charCodeAt(position));
}
139
/**
 * Joins several byte arrays into one new `Uint8Array`, preserving order.
 *
 * @param {Uint8Array[]} items - Arrays to join.
 * @returns {Uint8Array} A new array holding all bytes back to back.
 */
function concatUint8Arrays(items) {
  const combinedLength = items.reduce((sum, part) => sum + part.length, 0);
  const merged = new Uint8Array(combinedLength);
  let cursor = 0;
  items.forEach((part) => {
    merged.set(part, cursor);
    cursor += part.length;
  });
  return merged;
}
152
/**
 * Prepends a 44-byte RIFF/WAVE header to PCM data. The header written here
 * declares format tag 1, one channel, 16 bits per sample, a block align of 2
 * and a byte rate of `sampleRate * 2`.
 *
 * @param {Uint8Array} pcmBytes - Raw PCM bytes.
 * @param {number} sampleRate - Sample rate written into the header.
 * @returns {Uint8Array} Header followed by the PCM bytes.
 */
function pcmToWav(pcmBytes, sampleRate) {
  const HEADER_BYTES = 44;
  const headerBuffer = new ArrayBuffer(HEADER_BYTES);
  const fields = new DataView(headerBuffer);
  const payloadSize = pcmBytes.byteLength;
  const putTag = (position, tag) => {
    [...tag].forEach((letter, step) => {
      fields.setUint8(position + step, letter.charCodeAt(0));
    });
  };

  // RIFF chunk descriptor.
  putTag(0, "RIFF");
  fields.setUint32(4, 36 + payloadSize, true);
  putTag(8, "WAVE");

  // "fmt " sub-chunk (all multi-byte fields little-endian).
  putTag(12, "fmt ");
  fields.setUint32(16, 16, true);
  fields.setUint16(20, 1, true);
  fields.setUint16(22, 1, true);
  fields.setUint32(24, sampleRate, true);
  fields.setUint32(28, sampleRate * 2, true);
  fields.setUint16(32, 2, true);
  fields.setUint16(34, 16, true);

  // "data" sub-chunk header.
  putTag(36, "data");
  fields.setUint32(40, payloadSize, true);

  return concatUint8Arrays([new Uint8Array(headerBuffer), pcmBytes]);
}
176
/**
 * Encodes bytes as a base64 `data:` URI with the given MIME type, using the
 * global `btoa`.
 *
 * @param {Uint8Array} bytes - Bytes to encode.
 * @param {string} mimeType - MIME type placed in the URI.
 * @returns {string} A `data:<mimeType>;base64,...` URI.
 */
function bytesToDataUri(bytes, mimeType) {
  const binary = Array.from(bytes, (byte) => String.fromCharCode(byte)).join("");
  const payload = btoa(binary);
  return `data:${mimeType};base64,${payload}`;
}
183
/**
 * Creates a Web Audio playback backend for 16-bit little-endian mono PCM
 * delivered as base64 chunks. It supports one-shot playback of buffered
 * chunks and gapless scheduling of streamed chunks.
 *
 * Fix over the previous version: when streamed chunks arrive late (an
 * underrun), the schedule cursor could lie in the past. Sources started at a
 * past time begin immediately, so consecutive chunks overlapped. The cursor
 * is now re-anchored to the current time whenever it has fallen behind.
 *
 * @param {object} config
 * @param {number} config.sampleRate - Sample rate for the context and buffers.
 * @param {() => void} [config.onAudioStart] - Called when playback begins.
 * @param {() => void} [config.onAudioEnd] - Called when playback finishes on its own.
 * @param {(error: Error) => void} [config.onError] - Called for unsupported formats.
 * @returns {object} Backend with `kind`, `hasStreamingPlayback`, `isPlaying`,
 *   `stop`, `playBuffered`, `scheduleStreamingChunk`, `finalizeStreaming`, `close`.
 */
function createBrowserPlaybackBackend(config) {
  const { sampleRate, onAudioStart, onAudioEnd, onError } = config;
  // Small head start so the first streamed chunk is not scheduled in the past.
  const leadTime = 0.05;
  let audioContext = null;
  const audioSources = /* @__PURE__ */ new Set();
  let nextScheduleTime = 0;
  let streamEndHandled = false;
  let streamingStarted = false;

  // Lazily creates the AudioContext; returns null where the API is missing.
  const ensureAudioContext = () => {
    if (!audioContext && typeof AudioContext !== "undefined") {
      audioContext = new AudioContext({ sampleRate });
    }
    return audioContext;
  };

  // Converts int16 little-endian PCM bytes into a float AudioBuffer.
  const createAudioBufferFromPCM = (bytes) => {
    const numSamples = Math.floor(bytes.length / 2);
    const buffer = new AudioBuffer({ length: numSamples, sampleRate });
    const channelData = buffer.getChannelData(0);
    const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    for (let index = 0; index < numSamples; index += 1) {
      channelData[index] = dataView.getInt16(index * 2, true) / 32768;
    }
    return buffer;
  };

  const resetStreamState = () => {
    nextScheduleTime = 0;
    streamEndHandled = false;
    streamingStarted = false;
  };

  const reportUnsupportedFormat = (audioFormat) => {
    onError?.(new Error(`Built-in playback only supports pcm, got ${audioFormat}`));
  };

  const stop = () => {
    for (const source of audioSources) {
      try {
        // Detach the handler first so a manual stop does not fire onAudioEnd.
        source.onended = null;
        source.stop();
      } catch {
        // stop() may throw if the source was never started; nothing to undo.
      }
      source.disconnect();
    }
    audioSources.clear();
    resetStreamState();
  };

  return {
    kind: "browser",
    hasStreamingPlayback: true,
    isPlaying() {
      return audioSources.size > 0;
    },
    stop,
    async playBuffered(input) {
      if (input.audioFormat !== "pcm") {
        reportUnsupportedFormat(input.audioFormat);
        return;
      }
      const context = ensureAudioContext();
      if (!context || !input.chunks.length) {
        return;
      }
      stop();
      const chunkBytes = input.chunks.map((chunk) => decodeBase64Chunk(chunk));
      const buffer = createAudioBufferFromPCM(concatUint8Arrays(chunkBytes));
      const source = context.createBufferSource();
      if (context.state === "suspended") {
        await context.resume();
      }
      source.buffer = buffer;
      source.connect(context.destination);
      audioSources.add(source);
      source.onended = () => {
        audioSources.delete(source);
        source.disconnect();
        onAudioEnd?.();
      };
      onAudioStart?.();
      source.start(0);
    },
    async scheduleStreamingChunk(input) {
      if (input.audioFormat !== "pcm") {
        reportUnsupportedFormat(input.audioFormat);
        return;
      }
      const context = ensureAudioContext();
      if (!context) {
        return;
      }
      if (context.state === "suspended") {
        await context.resume();
      }
      const chunkBuffer = createAudioBufferFromPCM(decodeBase64Chunk(input.chunk));
      const source = context.createBufferSource();
      // Anchor the cursor on the first chunk, and re-anchor after an underrun:
      // a cursor in the past would make this and later chunks start at once
      // and play on top of each other.
      if (!nextScheduleTime || nextScheduleTime < context.currentTime) {
        nextScheduleTime = Math.max(context.currentTime + leadTime, leadTime);
      }
      source.buffer = chunkBuffer;
      source.connect(context.destination);
      audioSources.add(source);
      source.onended = () => {
        audioSources.delete(source);
        source.disconnect();
        // Only the last source of a finalized stream signals the end.
        if (streamEndHandled && audioSources.size === 0) {
          resetStreamState();
          onAudioEnd?.();
        }
      };
      if (!streamingStarted) {
        streamingStarted = true;
        onAudioStart?.();
      }
      source.start(nextScheduleTime);
      nextScheduleTime += chunkBuffer.duration;
    },
    finalizeStreaming() {
      streamEndHandled = true;
      // Everything already played out: report the end right away.
      if (audioSources.size === 0 && streamingStarted) {
        resetStreamState();
        onAudioEnd?.();
      }
    },
    close() {
      stop();
      if (audioContext) {
        // Best-effort close; a rejection here leaves nothing to recover.
        void audioContext.close().catch(() => {
        });
        audioContext = null;
      }
    }
  };
}
314
/**
 * Creates a playback backend on top of a Taro-style runtime
 * (`createInnerAudioContext`, `getFileSystemManager`, `env.USER_DATA_PATH`).
 * Buffered PCM is wrapped into a WAV file, written to a temp file when a
 * file system manager is available (otherwise passed as a data URI), and
 * played through the inner audio context.
 *
 * Fix over the previous version: failures while decoding or writing the
 * audio used to reject `playBuffered`, which is also called fire-and-forget,
 * producing an unhandled rejection. They are now reported through `onError`.
 *
 * @param {object} config
 * @param {object} [config.runtime] - Object exposing `Taro`.
 * @param {() => void} [config.onAudioStart] - Called on the audio `onPlay` event.
 * @param {() => void} [config.onAudioEnd] - Called on the audio `onEnded` event.
 * @param {(error: Error) => void} [config.onError] - Called on playback or preparation errors.
 * @returns {object} Backend with `kind`, `hasStreamingPlayback`, `isPlaying`,
 *   `stop`, `playBuffered`, `close`.
 */
function createMiniProgramPlaybackBackend(config) {
  const { runtime, onAudioStart, onAudioEnd, onError } = config;
  const taro = runtime?.Taro;
  const createInnerAudioContext = taro?.createInnerAudioContext;
  const getFileSystemManager = taro?.getFileSystemManager;
  const userDataPath = taro?.env?.USER_DATA_PATH;
  let innerAudio = createInnerAudioContext?.() ?? null;
  let currentTempFile = null;
  let playing = false;

  // Wires the audio context's lifecycle events to local state and callbacks.
  const bindEvents = () => {
    innerAudio?.onPlay?.(() => {
      playing = true;
      onAudioStart?.();
    });
    innerAudio?.onEnded?.(() => {
      playing = false;
      onAudioEnd?.();
    });
    innerAudio?.onStop?.(() => {
      playing = false;
    });
    innerAudio?.onError?.((error) => {
      playing = false;
      onError?.(new Error(error?.errMsg || "Mini-program audio playback failed"));
    });
  };
  bindEvents();

  // Deletes the previously written temp file, ignoring unlink failures.
  const cleanupTempFile = () => {
    if (!currentTempFile) {
      return;
    }
    const pathToDelete = currentTempFile;
    currentTempFile = null;
    getFileSystemManager?.()?.unlink?.({
      filePath: pathToDelete,
      fail: () => {
      }
    });
  };

  // Returns a playable source: a temp file path, or a data URI as fallback.
  const writeTempAudioFile = async (bytes) => {
    if (!userDataPath || !getFileSystemManager) {
      return bytesToDataUri(bytes, "audio/wav");
    }
    const filePath = `${userDataPath}/amaster-tts-${Date.now()}-${Math.random().toString(16).slice(2)}.wav`;
    const fsManager = getFileSystemManager();
    if (!fsManager?.writeFile) {
      return bytesToDataUri(bytes, "audio/wav");
    }
    await new Promise((resolve, reject) => {
      fsManager.writeFile?.({
        filePath,
        // Copy exactly the view's bytes; the underlying buffer may be larger.
        data: bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength),
        success: () => resolve(),
        fail: (error) => reject(error)
      });
    });
    cleanupTempFile();
    currentTempFile = filePath;
    return filePath;
  };

  // Normalizes callback-style failure payloads ({ errMsg }) into Error objects.
  const toError = (error, fallbackMessage) => {
    if (error instanceof Error) {
      return error;
    }
    return new Error(error?.errMsg || error?.message || fallbackMessage);
  };

  return {
    kind: "mini-program",
    hasStreamingPlayback: false,
    isPlaying() {
      return playing;
    },
    stop() {
      innerAudio?.stop?.();
      playing = false;
    },
    async playBuffered(input) {
      if (input.audioFormat !== "pcm") {
        onError?.(new Error(`Mini-program built-in playback only supports pcm, got ${input.audioFormat}`));
        return;
      }
      let source;
      try {
        const pcmChunks = input.chunks.map((chunk) => decodeBase64Chunk(chunk));
        const wavBytes = pcmToWav(concatUint8Arrays(pcmChunks), input.sampleRate);
        source = await writeTempAudioFile(wavBytes);
      } catch (error) {
        // Report instead of rejecting: callers may not await this method.
        onError?.(toError(error, "Failed to prepare mini-program audio"));
        return;
      }
      if (!innerAudio && createInnerAudioContext) {
        innerAudio = createInnerAudioContext();
        bindEvents();
      }
      if (!innerAudio?.play) {
        onError?.(new Error("Mini-program audio context is unavailable"));
        return;
      }
      innerAudio.src = source;
      innerAudio.autoplay = false;
      innerAudio.obeyMuteSwitch = false;
      innerAudio.play();
    },
    close() {
      innerAudio?.stop?.();
      innerAudio?.destroy?.();
      innerAudio = null;
      playing = false;
      cleanupTempFile();
    }
  };
}
414
/**
 * Picks a playback backend for the current runtime: the mini-program backend
 * when the runtime offers `Taro.createInnerAudioContext` and no `window`
 * exists, otherwise the browser backend when `AudioContext` is available.
 *
 * @param {object} config - Backend configuration, passed through unchanged.
 * @returns {object|null} The backend, or `null` when neither is usable.
 */
function resolvePlaybackBackend(config) {
  const hasTaroAudio = Boolean(config.runtime?.Taro?.createInnerAudioContext);
  if (hasTaroAudio && !isBrowserEnvironment()) {
    return createMiniProgramPlaybackBackend(config);
  }
  return typeof AudioContext === "undefined" ? null : createBrowserPlaybackBackend(config);
}
423
/**
 * Creates a realtime TTS client over a WebSocket to `TTS_PATH`. Text is sent
 * with `input_text_buffer.append` / `input_text_buffer.commit`; audio arrives
 * as base64 `response.audio.delta` messages and is played through the backend
 * chosen by `resolvePlaybackBackend` (streamed when the backend supports it,
 * otherwise buffered until `response.audio.done`).
 *
 * Fixes over the previous version:
 * - A socket that closes before `session.updated` now rejects `connect()`
 *   instead of leaving the promise pending forever.
 * - Late `close` events or failures from a superseded socket no longer clear
 *   the state of a newer connection.
 *
 * @param {object} config
 * @param {Function} [config.getAccessToken] - Accepted for compatibility; not used here.
 * @param {string} [config.voice="Cherry"] - Voice sent in `session.update`.
 * @param {boolean} [config.autoPlay=true] - Play audio automatically as it arrives.
 * @param {string} [config.audioFormat="pcm"] - Requested response format.
 * @param {number} [config.sampleRate=24000] - Requested sample rate.
 * @param {object} [config.runtime] - Runtime handle forwarded to the playback backend.
 * @param {() => void} [config.onReady] - Called on `session.updated`.
 * @param {() => void} [config.onAudioStart]
 * @param {() => void} [config.onAudioEnd]
 * @param {(chunks: string[]) => void} [config.onAudioChunk] - Receives a copy of all chunks so far.
 * @param {(error: Error) => void} [config.onError]
 * @param {() => void} [config.onClose] - Called at the end of `close()`.
 * @returns {object} Client with `connect`, `speak`, `startStream`, `appendText`,
 *   `commitText`, `play`, `stop`, `close` and state getters.
 */
function createRawTTSClient(config) {
  const {
    getAccessToken,
    voice = "Cherry",
    autoPlay = true,
    audioFormat = "pcm",
    sampleRate = 24e3,
    runtime,
    onReady,
    onAudioStart,
    onAudioEnd,
    onAudioChunk,
    onError,
    onClose
  } = config;
  // Kept in the signature but intentionally unused.
  void getAccessToken;
  let ws = null;
  let connected = false;
  let audioChunks = [];
  let responseDone = false;
  let autoPlayed = false;
  let playbackSuppressed = false;
  let playbackBackend = resolvePlaybackBackend({
    runtime,
    sampleRate,
    onAudioStart,
    onAudioEnd,
    onError
  });

  function buildWsUrl() {
    let path = TTS_PATH;
    return path;
  }

  // Replays the buffered audio; a no-op while a streaming response is still arriving.
  function play() {
    playbackSuppressed = false;
    if (!audioChunks.length || !playbackBackend) {
      return;
    }
    if (!responseDone && playbackBackend.hasStreamingPlayback) {
      return;
    }
    void playbackBackend.playBuffered({
      chunks: [...audioChunks],
      sampleRate,
      audioFormat
    });
  }

  // Stops playback and suppresses auto-play until play() or a new synthesis.
  function stop() {
    playbackSuppressed = true;
    playbackBackend?.stop();
  }

  function close() {
    stop();
    if (ws) {
      ws.close();
      ws = null;
    }
    connected = false;
    playbackBackend?.close();
    // Prepare a fresh backend so the client can be reconnected later.
    playbackBackend = resolvePlaybackBackend({
      runtime,
      sampleRate,
      onAudioStart,
      onAudioEnd,
      onError
    });
    onClose?.();
  }

  async function connect() {
    if (connected && ws?.readyState === WebSocket.OPEN) {
      return;
    }
    await new Promise((resolve, reject) => {
      const socket = new WebSocket(buildWsUrl());
      ws = socket;
      let settled = false;
      const settleResolve = () => {
        if (settled) {
          return;
        }
        settled = true;
        connected = true;
        resolve();
      };
      const settleReject = (error) => {
        if (settled) {
          return;
        }
        settled = true;
        // Only touch shared state if this socket is still the active one.
        if (ws === socket) {
          connected = false;
        }
        reject(error);
      };
      socket.onmessage = async (event) => {
        try {
          const data = JSON.parse(event.data);
          if (data.type === "session.created") {
            socket.send(
              JSON.stringify({
                type: "session.update",
                session: {
                  mode: "server_commit",
                  voice,
                  response_format: audioFormat,
                  sample_rate: sampleRate
                }
              })
            );
          }
          if (data.type === "session.updated") {
            onReady?.();
            settleResolve();
            return;
          }
          if (data.type === "response.audio.delta") {
            audioChunks.push(data.delta);
            onAudioChunk?.([...audioChunks]);
            if (autoPlay && !playbackSuppressed && playbackBackend?.hasStreamingPlayback) {
              autoPlayed = true;
              await playbackBackend.scheduleStreamingChunk?.({
                chunk: data.delta,
                sampleRate,
                audioFormat
              });
            }
            return;
          }
          if (data.type === "response.audio.done") {
            responseDone = true;
            onAudioChunk?.([...audioChunks]);
            if (playbackBackend?.hasStreamingPlayback) {
              playbackBackend.finalizeStreaming?.();
              return;
            }
            if (autoPlay && !playbackSuppressed && !autoPlayed) {
              autoPlayed = true;
              await playbackBackend?.playBuffered({
                chunks: [...audioChunks],
                sampleRate,
                audioFormat
              });
            }
            return;
          }
          if (data.type === "error") {
            const error = new Error(data.error?.message || "Unknown TTS error");
            onError?.(error);
            settleReject(error);
          }
        } catch (error) {
          const parsedError = error instanceof Error ? error : new Error(String(error));
          onError?.(parsedError);
          settleReject(parsedError);
        }
      };
      socket.onerror = () => {
        const error = new Error("WebSocket connection error");
        onError?.(error);
        settleReject(error);
      };
      socket.onclose = () => {
        // A stale socket closing late must not clobber a newer connection.
        if (ws === socket) {
          connected = false;
          ws = null;
        }
        // No-op once settled; otherwise unblocks a caller awaiting connect().
        settleReject(new Error("WebSocket closed before the session was ready"));
      };
    });
  }

  function resetSynthesisState() {
    stop();
    audioChunks = [];
    responseDone = false;
    autoPlayed = false;
    playbackSuppressed = false;
  }

  function ensureSocketReady() {
    if (!ws || ws.readyState !== WebSocket.OPEN) {
      throw new Error("WebSocket not connected");
    }
  }

  // Cleans and fragments the text, then appends each fragment to the input buffer.
  function appendText(text) {
    const normalizedText = preprocessTTSContent(text);
    const fragments = splitTextIntoFragments(normalizedText);
    if (!fragments.length) {
      return;
    }
    ensureSocketReady();
    const socket = ws;
    for (const fragment of fragments) {
      socket?.send(
        JSON.stringify({
          type: "input_text_buffer.append",
          text: fragment
        })
      );
    }
  }

  function commitText() {
    ensureSocketReady();
    const socket = ws;
    socket?.send(
      JSON.stringify({
        type: "input_text_buffer.commit"
      })
    );
  }

  function startStream() {
    resetSynthesisState();
  }

  async function speak(text) {
    startStream();
    appendText(text);
    commitText();
  }

  return {
    connect,
    speak,
    startStream,
    appendText,
    commitText,
    play,
    stop,
    close,
    isConnected() {
      return connected && ws?.readyState === WebSocket.OPEN;
    },
    hasAudio() {
      return audioChunks.length > 0;
    },
    isResponseDone() {
      return responseDone;
    },
    isPlaying() {
      return playbackBackend?.isPlaying() ?? false;
    },
    isStreamingPlayback() {
      return playbackBackend?.hasStreamingPlayback ?? false;
    }
  };
}
194
- var tts_client_default = (authConfig) => {
195
- return (config) => {
196
- return createTTSClient({ ...authConfig, ...config });
660
/**
 * Builds the initial controller snapshot: idle, no active item, no error,
 * request id 0, no text and no fallback, carrying the given voice.
 *
 * @param {string|null} voice - Voice to record in the snapshot.
 * @returns {object} A fresh snapshot object.
 */
function defaultSnapshot(voice) {
  const initialState = {
    status: "idle",
    activeId: null,
    error: null,
    requestId: 0,
    text: null
  };
  return { ...initialState, voice, fallbackMode: "none" };
}
671
/**
 * Reports whether the runtime exposes both `speechSynthesis` and
 * `SpeechSynthesisUtterance` on `window`.
 *
 * @returns {boolean} `true` when system speech synthesis can be used.
 */
function canUseSystemSpeech() {
  if (!isBrowserEnvironment()) {
    return false;
  }
  const requiredApis = ["speechSynthesis", "SpeechSynthesisUtterance"];
  return requiredApis.every((api) => api in window);
}
674
/**
 * Speaks `text` through the system speech synthesizer, cancelling any
 * utterance already in progress first.
 *
 * @param {string} text - Text to speak.
 * @param {object} options
 * @param {() => void} [options.onStart] - Called when the utterance starts.
 * @param {(error: Error) => void} [options.onError] - Called when the utterance fails.
 * @param {() => void} [options.onEnd] - Called when the utterance ends.
 * @throws {Error} When system speech synthesis is not available.
 */
function systemSpeak(text, options) {
  const supported = canUseSystemSpeech();
  if (!supported) {
    throw new Error("SpeechSynthesis is not supported");
  }
  const utterance = new SpeechSynthesisUtterance(text);
  utterance.onstart = () => options.onStart?.();
  utterance.onerror = (event) => {
    const failure = new Error(event.error || "Speech synthesis failed");
    options.onError?.(failure);
  };
  utterance.onend = () => options.onEnd?.();
  const synthesizer = window.speechSynthesis;
  synthesizer.cancel();
  synthesizer.speak(utterance);
}
691
/**
 * Cancels any system speech in progress; does nothing when system speech
 * synthesis is unavailable.
 */
function stopSystemSpeech() {
  if (!canUseSystemSpeech()) {
    return;
  }
  window.speechSynthesis.cancel();
}
696
/**
 * Creates a controller that manages one TTS client at a time and publishes
 * an observable snapshot (`status`, `activeId`, `error`, `requestId`, `text`,
 * `voice`, `fallbackMode`). It supports one-shot speaking, incremental
 * streaming, replay of cached audio, and an optional fallback to system
 * speech synthesis when the client fails.
 *
 * Fixes over the previous version:
 * - `toggle` no longer depends on `this`, so it keeps working when the
 *   method is destructured or passed around as a callback.
 * - Client reuse is decided by the voice the client was actually created
 *   with. Previously it compared against `snapshot.voice`, which `setVoice`
 *   changes, so audio in the old voice was replayed after a voice change.
 *
 * @param {(config: object) => object} createClient - Factory for TTS clients.
 * @param {object} [options]
 * @param {object} [options.runtime] - Forwarded to created clients.
 * @param {string} [options.voiceStorageKey] - Key used to persist the voice.
 * @param {object} [options.storage] - Storage with `getItem`/`setItem` and optional `removeItem`.
 * @param {boolean} [options.fallbackToSystemSpeech] - Set to `false` to disable the fallback.
 * @returns {object} Controller API.
 */
function createTTSSpeakController(createClient, options = {}) {
  const listeners = /* @__PURE__ */ new Set();
  const persistedVoice = options.voiceStorageKey && options.storage
    ? options.storage.getItem(options.voiceStorageKey) || null
    : null;
  let client = null;
  // Voice the current client was created with (null = client default).
  let clientVoice = null;
  let snapshot = defaultSnapshot(persistedVoice);
  let streamActive = false;
  let streamId = null;

  const emit = () => {
    for (const listener of listeners) {
      listener(snapshot);
    }
  };

  const setSnapshot = (next) => {
    snapshot = {
      ...snapshot,
      ...next
    };
    emit();
  };

  const persistVoice = (voice) => {
    if (!options.voiceStorageKey || !options.storage) {
      return;
    }
    if (!voice) {
      options.storage.removeItem?.(options.voiceStorageKey);
      return;
    }
    options.storage.setItem(options.voiceStorageKey, voice);
  };

  const reset = (requestId, preserved) => {
    snapshot = {
      status: "idle",
      activeId: null,
      error: null,
      requestId,
      text: preserved?.text ?? null,
      voice: preserved?.voice ?? snapshot.voice,
      fallbackMode: "none"
    };
    emit();
  };

  const isActive = (id) => {
    const busy = snapshot.status === "connecting" || snapshot.status === "speaking";
    if (!id) {
      return busy;
    }
    return snapshot.activeId === id && busy;
  };

  const stop = (stopOptions) => {
    const preserveClient = stopOptions?.preserveClient ?? true;
    const nextRequestId = snapshot.requestId + 1;
    streamActive = false;
    streamId = null;
    client?.stop();
    stopSystemSpeech();
    if (!preserveClient || !client || !snapshot.text) {
      client?.close();
      client = null;
      reset(nextRequestId, { voice: snapshot.voice });
      return;
    }
    // Keep the client (and its buffered audio) so the same text can be replayed.
    reset(nextRequestId, {
      text: snapshot.text,
      voice: snapshot.voice
    });
  };

  const createManagedClient = (input, requestId, content) => {
    const nextVoice = input.voice ?? snapshot.voice ?? void 0;
    // Every callback ignores events from a client that has been replaced.
    const nextClient = createClient({
      voice: nextVoice,
      autoPlay: true,
      audioFormat: input.audioFormat,
      sampleRate: input.sampleRate,
      runtime: options.runtime,
      onReady: () => {
        if (client !== nextClient) {
          return;
        }
        setSnapshot({
          status: "connecting",
          error: null
        });
      },
      onAudioStart: () => {
        if (client !== nextClient) {
          return;
        }
        setSnapshot({
          status: "speaking",
          error: null,
          fallbackMode: "none"
        });
      },
      onAudioEnd: () => {
        if (client !== nextClient) {
          return;
        }
        setSnapshot({
          status: streamActive ? "connecting" : "idle",
          activeId: streamActive ? streamId : null,
          error: null,
          fallbackMode: "none"
        });
      },
      onError: (error) => {
        if (client !== nextClient) {
          return;
        }
        streamActive = false;
        streamId = null;
        setSnapshot({
          status: "error",
          error: error.message,
          fallbackMode: "none"
        });
      },
      onClose: () => {
        if (client !== nextClient) {
          return;
        }
        client = null;
      }
    });
    client = nextClient;
    clientVoice = nextVoice ?? null;
    setSnapshot({
      status: "connecting",
      activeId: input.id ?? null,
      error: null,
      requestId,
      text: content,
      voice: nextVoice ?? null,
      fallbackMode: "none"
    });
    return {
      nextClient,
      nextVoice
    };
  };

  const ensureStreamClient = async (streamOptions) => {
    const nextRequestId = snapshot.requestId + 1;
    const nextVoice = streamOptions.voice ?? snapshot.voice ?? void 0;
    if (client && clientVoice === (nextVoice ?? null)) {
      streamActive = true;
      streamId = streamOptions.id ?? null;
      setSnapshot({
        status: client.isPlaying() ? "speaking" : "connecting",
        activeId: streamId,
        error: null,
        requestId: nextRequestId,
        voice: nextVoice ?? null,
        fallbackMode: "none"
      });
      client.startStream();
      return;
    }
    stop({ preserveClient: false });
    const { nextClient } = createManagedClient(
      {
        id: streamOptions.id,
        voice: streamOptions.voice,
        audioFormat: streamOptions.audioFormat,
        sampleRate: streamOptions.sampleRate
      },
      nextRequestId,
      ""
    );
    streamActive = true;
    streamId = streamOptions.id ?? null;
    await nextClient.connect();
    nextClient.startStream();
  };

  const speak = async (speakOptions) => {
    const content = preprocessTTSContent(speakOptions.text);
    if (!content) {
      stop({ preserveClient: false });
      return;
    }
    const nextRequestId = snapshot.requestId + 1;
    const nextVoice = speakOptions.voice ?? snapshot.voice ?? void 0;
    // Same text, same voice, audio already buffered: replay without a new request.
    if (client && snapshot.text === content && clientVoice === (nextVoice ?? null) && client.hasAudio()) {
      setSnapshot({
        status: "speaking",
        activeId: speakOptions.id ?? null,
        error: null,
        requestId: nextRequestId,
        text: content,
        voice: nextVoice ?? null,
        fallbackMode: "none"
      });
      client.play();
      return;
    }
    stop({ preserveClient: false });
    streamActive = false;
    streamId = null;
    const { nextClient } = createManagedClient(
      {
        id: speakOptions.id,
        voice: speakOptions.voice,
        audioFormat: speakOptions.audioFormat,
        sampleRate: speakOptions.sampleRate
      },
      nextRequestId,
      content
    );
    try {
      await nextClient.connect();
      await nextClient.speak(content);
    } catch (error) {
      // A newer request replaced this client while it was connecting.
      if (client !== nextClient) {
        return;
      }
      if (options.fallbackToSystemSpeech !== false && canUseSystemSpeech()) {
        client?.close();
        client = null;
        systemSpeak(content, {
          onStart: () => {
            setSnapshot({
              status: "speaking",
              error: null,
              activeId: speakOptions.id ?? null,
              requestId: nextRequestId,
              text: content,
              voice: nextVoice ?? null,
              fallbackMode: "system"
            });
          },
          onEnd: () => {
            setSnapshot({
              status: "idle",
              activeId: null,
              error: null,
              fallbackMode: "none"
            });
          },
          onError: (fallbackError) => {
            setSnapshot({
              status: "error",
              error: fallbackError.message,
              fallbackMode: "none"
            });
          }
        });
        return;
      }
      client?.close();
      client = null;
      setSnapshot({
        status: "error",
        error: error instanceof Error ? error.message : String(error),
        fallbackMode: "none"
      });
    }
  };

  return {
    getSnapshot() {
      return snapshot;
    },
    subscribe(listener) {
      listeners.add(listener);
      return () => {
        listeners.delete(listener);
      };
    },
    speak,
    async startStream(streamOptions) {
      await ensureStreamClient(streamOptions);
    },
    async appendStreamText(streamOptions) {
      const content = preprocessTTSContent(streamOptions.text);
      if (!content) {
        return;
      }
      if (!streamActive || streamId !== (streamOptions.id ?? null) || !client) {
        await ensureStreamClient(streamOptions);
      }
      client?.appendText(content);
      setSnapshot({
        status: snapshot.status === "speaking" ? "speaking" : "connecting",
        activeId: streamOptions.id ?? null,
        error: null,
        text: `${snapshot.text || ""}${content}`,
        fallbackMode: "none"
      });
    },
    commitStream() {
      if (!client || !streamActive) {
        return;
      }
      client.commitText();
      setSnapshot({
        status: snapshot.status === "speaking" ? "speaking" : "connecting",
        activeId: streamId,
        error: null,
        fallbackMode: "none"
      });
    },
    finishStream() {
      streamActive = false;
      streamId = null;
      setSnapshot({
        status: client?.isPlaying() ? "speaking" : "idle",
        activeId: client?.isPlaying() ? snapshot.activeId : null,
        error: null,
        fallbackMode: "none"
      });
    },
    stop,
    release() {
      stop({ preserveClient: false });
    },
    async toggle(toggleOptions) {
      if (isActive(toggleOptions.id)) {
        stop();
        return;
      }
      await speak(toggleOptions);
    },
    isActive,
    setVoice(voice) {
      setSnapshot({ voice });
      persistVoice(voice);
    },
    getVoice() {
      return snapshot.voice;
    }
  };
}
1022
/**
 * Binds shared auth configuration and returns a client factory. Per-call
 * config is merged over `authConfig`, so per-call keys win.
 *
 * @param {object} authConfig - Configuration shared by every created client.
 * @returns {(config: object) => object} Factory producing raw TTS clients.
 */
function createTTSClient(authConfig) {
  return function createClientWithAuth(config) {
    const mergedConfig = { ...authConfig, ...config };
    return createRawTTSClient(mergedConfig);
  };
}
199
1025
  // Annotate the CommonJS export names for ESM import in node:
200
1026
  0 && (module.exports = {
201
- createTTSClient
1027
+ createTTSClient,
1028
+ createTTSSpeakController,
1029
+ preprocessTTSContent,
1030
+ splitTextIntoFragments
202
1031
  });
203
1032
  //# sourceMappingURL=index.cjs.map